class SimpleNLP(object):

    def __init__(self, method=1, doc=None, datalist=None):
        self.doc = doc
        self.datalist = datalist
        self.seg = Seg()
        self.sentiment = Sentiment(method)
        self.method = method

    def seg_datalist(self):
        return self.seg.seg_from_datalist(self.datalist)

    def seg_doc(self):
        return self.seg.seg_from_doc(self.doc)

    def get_keyword_datalist(self):
        return dict(self.seg.get_keyword_from_datalist(self.datalist))

    def sentiment_analysis_doc(self):
        # method 1 -> Naive Bayes model, method 2 -> SVM model
        if self.method == 1:
            self.sentiment.load_model(root_path + '/data/naivebayes_model30000v3')
        elif self.method == 2:
            self.sentiment.load_model(root_path + '/data/svmmodel10000v4')
        return self.sentiment.predict_sentence_doc(self.doc)

    def sentiment_analysis_datalist(self):
        if self.method == 1:
            self.sentiment.load_model(root_path + '/data/naivebayes_model30000v3')
        elif self.method == 2:
            self.sentiment.load_model(root_path + '/data/svmmodel10000v4')
        return self.sentiment.predict_datalist(self.datalist)
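# Usage sketch (added illustration, not part of the original module): drive SimpleNLP
# end-to-end for sentiment analysis. Assumes root_path and the pickled models under
# data/ exist as referenced above; the sample texts and the helper name are
# hypothetical placeholders.
def _demo_simplenlp():
    doc = '这首歌真好听'                        # hypothetical single document
    datalist = ['歌词很感人', '编曲有点单调']    # hypothetical list of documents
    nlp = SimpleNLP(method=1, doc=doc, datalist=datalist)  # 1 = Naive Bayes, 2 = SVM
    print(nlp.sentiment_analysis_doc())         # prediction for the single doc
    print(nlp.sentiment_analysis_datalist())    # one score per entry in datalist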
def main():
    doc = '''杰森我爱你!加油你是最棒的!'''
    start_time = time.time()
    datalist = Seg().get_data_from_mysql(5, 0)
    npl = SimpleNLP(1, doc, datalist)
    print(npl.seg_doc())
    print(npl.seg_datalist())
    keyword = npl.get_keyword_datalist()
    print(keyword)
    print(len(keyword))
    '''
class Sentiment:

    def __init__(self):
        self.classifier = Bayes()
        self.seg = Seg()
        self.seg.load('seg.pickle')

    def save(self, fname):
        self.classifier.save(fname)

    def load(self, fname):
        self.classifier = self.classifier.load(fname)

    def handle(self, doc):
        # segment the document, then drop stop words
        words = self.seg.seg(doc)
        words = self.filter_stop(words)
        return words

    def train(self, neg_docs, pos_docs):
        datas = []
        for doc in neg_docs:
            datas.append([self.handle(doc), 'neg'])
        for doc in pos_docs:
            datas.append([self.handle(doc), 'pos'])
        self.classifier.train(datas)

    def classify(self, doc):
        # always return the probability of the 'pos' class
        ret, prob = self.classifier.classify(self.handle(doc))
        if ret == 'pos':
            return prob
        else:
            return 1 - prob

    @staticmethod
    def filter_stop(words):
        return list(filter(lambda x: x not in stop_words, words))
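# Usage sketch (added illustration): train the Bayes-based Sentiment classifier above
# from two lists of raw documents and score a new one. Assumes seg.pickle and the
# Bayes/Seg/stop_words objects used by the class are available; the corpus contents,
# model filename, and helper name are hypothetical.
def _demo_bayes_sentiment(neg_docs, pos_docs):
    s = Sentiment()
    s.train(neg_docs, pos_docs)            # docs are segmented and stop-filtered internally
    s.save('naivebayes_model_demo')        # hypothetical output name
    return s.classify('这张专辑太好听了')   # returns P(pos) in [0, 1]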
def Analysis(lyric, mod=True):
    # mod=False: train a new SVM sentiment model from the Douban corpus
    # mod=True:  load the saved model and analyse the lyrics directly
    if not mod:
        pos = []
        neg = []
        with open("D:\\Academic_work\\01_ERG3010\\Project\\corpus\\doubandata.txt",
                  'r', encoding='utf-8-sig') as f:
            for line in f:
                line = line.split("##")
                try:
                    star = int(line[1])
                except (ValueError, IndexError):
                    continue
                if star == 1 or star == 2:
                    neg.append(line[2].strip('\n'))
                elif star == 4 or star == 5:
                    pos.append(line[2].strip('\n'))

        # segment
        seg_pos = Seg().seg_from_datalist(pos)
        seg_neg = Seg().seg_from_datalist(neg)

        # build the training set
        word_list = []
        label_list = []
        train_data = []
        shuffle(seg_pos)
        shuffle(seg_neg)
        for k in seg_pos[:500]:
            train_data.append(('pos', k))
            word_list.append(k)
            label_list.append('pos')
        for k in seg_neg[:500]:
            train_data.append(('neg', k))
            word_list.append(k)
            label_list.append('neg')

        # feature selection, training and saving
        fe = FeatureExtraction(word_list, label_list)
        best_words = fe.best_words(3000, False)
        # best_words = "D:\\Academic_work\\01_ERG3010\\Project\\lyricsAnalysis2\\svmmodel-bestwords.dat"
        # (overriding the selected features with a fixed path is disabled so the features above are used)
        model = Sentiment(best_words)
        model.train_model(train_data)
        model.save_model(root_path + "\\lyricsAnalysis2\\svmmodel")
    else:
        model = Sentiment()
        model.load_model(root_path + "\\lyricsAnalysis2\\svmmodel")

    # lyric is a list with one entry per song
    result = model.predict_datalist(lyric)

    data = []
    count = 1
    for prob in result:
        data.append([count, prob, "Pos"])
        data.append([count, 1 - prob, "Neg"])
        count += 1

    # text visualization
    tr = ThemeRiver("Sentiment", title_color="#274C77", title_text_size=20)
    tr.add(['Pos', 'Neg'], data,
           is_label_show=True, is_datazoom_show=True,
           legend_text_color="#274C77", legend_text_size=15)
    tr.render("ThemeRiver.html")
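# Usage sketch (added illustration): run the ThemeRiver pipeline above on a list of
# song lyrics with the pre-trained SVM model (mod=True). The lyric strings and the
# helper name are hypothetical; the chart is written to ThemeRiver.html as above.
def _demo_analysis():
    lyrics = ['第一首歌的歌词', '第二首歌的歌词']   # one string per song
    Analysis(lyrics, mod=True)   # set mod=False to retrain from doubandata.txt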
class SVM(object):

    def __init__(self, c, best_words):
        self.seg = Seg()
        self.clf = SVC(probability=True, C=c)
        self.train_data = []
        self.train_label = []
        self.best_words = best_words

    def words2vector(self, all_data):
        # bag-of-words count vector over the selected best_words features
        vectors = []
        for data in all_data:
            vector = []
            for feature in self.best_words:
                vector.append(data.count(feature))
            vectors.append(vector)
        vectors = np.array(vectors)
        return vectors

    def train_model(self, data):
        print("------ SVM Classifier is training ------")
        for d in data:
            label = d[0]
            doc = d[1]
            self.train_data.append(doc)
            self.train_label.append(label)
        self.train_data = np.array(self.train_data)
        self.train_label = np.array(self.train_label)
        train_vectors = self.words2vector(self.train_data)
        self.clf.fit(train_vectors, self.train_label)
        print("------ SVM Classifier training over ------")

    def save_model(self, filename):
        print("------ SVM Classifier is saving model ------")
        joblib.dump(self.clf, filename + '-model.m')
        f = gzip.open(filename + '-bestwords.dat', 'wb')
        d = {'best words': self.best_words}
        f.write(pickle.dumps(d))
        f.close()
        print("------ SVM Classifier saving model over ------")

    def load_model(self, filename):
        print("------ SVM Classifier is loading model ------")
        self.clf = joblib.load(filename + '-model.m')
        f = gzip.open(filename + '-bestwords.dat', 'rb')
        d = pickle.loads(f.read())
        f.close()
        self.best_words = d['best words']
        print("------ SVM Classifier loading model over ------")

    def predict_wordlist(self, sentence):
        vector = self.words2vector([sentence])
        prediction = self.clf.predict(vector)
        # second column of predict_proba corresponds to the 'pos' class
        prob = self.clf.predict_proba(vector)[0][1]
        return prediction[0], prob

    def predict_sentence(self, sentence):
        seged_sentence = self.seg.seg_from_doc(sentence)
        prediction, prob = self.predict_wordlist(seged_sentence)
        return prediction, prob

    def predict_datalist(self, datalist):
        seged_datalist = self.seg.seg_from_datalist(datalist)
        result = []
        for data in seged_datalist:
            prediction, prob = self.predict_wordlist(data)
            result.append(prob)
        return result
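# Usage sketch (added illustration): train and persist the SVM classifier above.
# train_data items are (label, word_list) pairs, matching the d[0]/d[1] access in
# train_model; in the real pipeline best_words comes from FeatureExtraction and the
# segmenter must be available for predict_sentence, so the toy literals and the
# helper name below are hypothetical placeholders.
def _demo_svm():
    best_words = ['好听', '难听', '感人', '无聊']   # hypothetical feature words
    train_data = [('pos', ['这首', '歌', '好听']),
                  ('pos', ['歌词', '感人']),
                  ('neg', ['编曲', '难听']),
                  ('neg', ['旋律', '无聊'])]
    clf = SVM(50, best_words)
    clf.train_model(train_data)
    clf.save_model('svmmodel_demo')                    # writes svmmodel_demo-model.m / -bestwords.dat
    return clf.predict_sentence('这首歌真的很感人')     # -> (label, P(pos))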
def get_seg(self, fname='seg.pickle'):
    seg = Seg()
    seg.load(fname)
    return seg
class Merge:
    # @jieba True means jieba will be used for NER.
    # Because the NER results from jieba are not good enough, @jieba must stay False
    # until there is a better approach; that means self.s is always not None.
    def __init__(self, jieba=False, stanford=True):
        if jieba:
            self.json = json
            from seg import AllInfo
            from stanford import Stanford
            self.w = AllInfo()
            self.s = None
            if stanford:
                self.st = Stanford(False)
            else:
                self.dep = Dep()
        else:
            self.p = re.compile(u"\u25aa")
            self.json = json
            self.dep = Dep()
            from seg import Seg
            from stanford import Stanford
            self.w = Seg()
            self.s = Stanford(False)

    # ner, pos must look like nn,nr,vv,ww
    # dep must look like word_id@@word dep+head\t
    # the input is a str (the result in one line)
    def _merge_with_str(self, line_ner, line_pos, line_dep, line_seg):
        ner = line_ner.split(",")
        pos = line_pos.split(",")
        if line_dep is not None:
            deps = line_dep.split("\t")
            line = ""
            lens = len(ner) - 1
            for dep in deps:
                info = dep.split("@@")
                id = int(info[0])
                if id > lens:
                    continue
                line += info[1].decode("gbk")
                line += " " + pos[id] + "\t"
                # line += ' ' + ner[id] + ' ' + pos[id] + '\t'
            line = line.strip("\t")
            return line
        else:
            seg = line_seg.split(" ")
            line = ""
            if len(seg) != len(pos):
                print line_seg.encode("utf-8")
                print line_pos
            for id in xrange(len(ner)):
                if ner[id] != "O":
                    seg[id] = ner[id]
                # line += seg[id] + ' ' + ner[id] + ' ' + pos[id] + '\t'
                line += seg[id] + " " + pos[id] + "\t"
            line = line.strip("\t")
            return line

    # this method is for processing the json
    def _process(self, line_json):
        decoded = self.json.loads(line_json)
        line_ner = decoded["ner"]
        line_pos = decoded["pos"]
        line_seg = decoded["seg"]
        return (line_ner, line_pos, line_seg)

    # this method is for getting all the info of a line, without merging it into one line (returns a tuple)
    def _process_line(self, line_json):
        (line_ner, line_pos, line_seg) = self._process(line_json)
        line_dep = self.dep.dep_from_line(line_seg.encode("gbk"))
        deps = line_dep.split("\t")
        line = ""
        for dep in deps:
            info = dep.split("@@")
            info = info[1].split(" ")
            line += info[1].decode("gbk") + " "
        line = line.strip(" ")
        return (line_ner, line_pos, line_seg, line)

    # this method parses the line and merges all the info
    # it is used when only the json (including seg, ner and pos) from indri is available;
    # it can get the dep from stanford and merge it into the line
    # so it should not be used right now
    def merge(self, line_json, dep=False):
        if dep:
            (line_ner, line_pos, line_seg) = self._process(line_json)
            line_seg = self.p.sub(".", line_seg)
            line_dep = self.dep.dep_from_line(line_seg.encode("gbk"))
            line = self._merge_with_str(line_ner, line_pos, line_dep, None)
            return line
        else:
            (line_ner, line_pos, line_seg) = self._process(line_json)
            line = self._merge_with_str(line_ner, line_pos, None, line_seg)
            return line

    def add_new_words(self, newwords):
        self.w.add_words(newwords)

    def ner_using_nlpc(self, line):
        (line_seg, pos, ner) = self.w.getInfo(line)
        line_ner = self.dep._dep_line(line_seg.encode("gbk", "ignore"))
        sner = line_ner.split("\t")
        # (line_seg, line_pos, line_ner) = self.dep._dep_all(line.encode('gbk', 'ignore'))
        if len(ner) != len(sner):
            return ("", "", "")
        for i in xrange(len(ner)):
            j = ner[i]
            if j != "other":
                sner[i] = j
        return ("\t".join(line_seg.split(" ")), "\t".join(pos), "\t".join(sner))
        # return (line_seg, line_pos, line_ner)

    # this method is for getting the json of a line
    # now it's for testing
    # i need to get the json (including seg, ner and pos) from indri, and then use stanford to get the dep
    # BTW, I don't need dep anymore, so the method won't return dep
    # so, if stanford(True), the method must change
    def _get_line_json(self, line):
        if self.s is not None:
            dict = {"seg": "", "ner": "", "pos": ""}
            dict["seg"] = self.w.seg(line)
            (dict["ner"], dict["pos"]) = self.s.get_ner_pos(dict["seg"])
            return self.json.dumps(dict)
        else:
            (line_seg, pos, ner) = self.w.getInfo(line)
            (line_ner, line_pos) = self.st.get_ner_pos(line_seg)
            sner = line_ner.split(",")
            if len(ner) != len(sner):
                return ("", "", "")
            for i in xrange(len(ner)):
                j = ner[i]
                if j != "other":
                    sner[i] = j
            return (line_seg, line_pos, ",".join(sner))

    # @dep False means without dep
    def get_line_info(self, line_json, dep=False):
        if self.s is not None:
            if dep:
                return self._process_line(line_json)
            else:
                (line_ner, line_pos, line_seg) = self._process(line_json)
                return (line_ner, line_pos, line_seg, None)
        else:
            (line_seg, line_pos, line_ner) = self._get_line_json(line_json)
            print line_seg
            print line_pos
            return (",".join(line_ner), ",".join(line_pos), " ".join(line_seg), None)

    # this method is for testing
    # it uses the jieba and stanford tools to get the ner and pos results, transforms them into json,
    # and uses the method 'merge' to check that the correct pattern is printed
    # correct pattern: word dep (use ',' to split all parse results) ner pos\tword dep ner pos
    def test(self):
        for line in sys.stdin:
            line = line.strip("\n")
            line_json = self._get_line_json(line)
            line = self.merge(line_json)
            print line.encode("utf-8")

    # this method is for testing
    def test2(self):
        for line in sys.stdin:
            line = line.strip("\n")
            if self.s is not None:
                line_json = self._get_line_json(line)
                (line_ner, line_pos, line_seg, line_dep) = self.get_line_info(line_json, False)
                print line_ner
                print line_pos
                print line_seg.encode("utf-8")
                # print line_dep.encode('utf-8')
            else:
                (line_ner, line_pos, line_seg, line_dep) = self.get_line_info(line, False)
                print line_seg.encode("utf-8")
                print line_ner
                print line_pos
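# Usage sketch (added illustration, Python 2 like the class above): feed Merge a json
# line carrying seg/ner/pos and get back the merged "word pos\tword pos" string
# described in the comments. Assumes the stanford/Dep back-ends are configured; the
# seg/ner/pos values and the helper name are hypothetical.
def _demo_merge():
    m = Merge(jieba=False)
    line_json = m.json.dumps({"seg": u"我 爱 北京", "ner": u"O,O,ns", "pos": u"PN,VV,NR"})
    merged = m.merge(line_json)      # -> u"我 PN\t爱 VV\tns NR"
    print merged.encode("utf-8")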
def __init__(self, best_words=None):
    self.svm = SVM(50, best_words)
    self.seg = Seg()
from seg import Seg

'''
1. Estimate the HMM parameters by direct (maximum-likelihood) counting.
2. Use the Viterbi algorithm to choose the segmentation sequence with the highest probability.
'''

segger = Seg()


# training data format (character/tag pairs), e.g.:
# 澳/b 门/e 的/s 回/b 归/e 一/b 定/e 能/b 够/e 顺/b 利/e 实/b 现/e 。/s
def train(fname):
    datas = []
    i = 0
    with open(fname, 'r', encoding='utf-8') as f:
        for line in f:
            # only use the first 10000 lines of the corpus
            if i == 10000:
                break
            line = line.rstrip()
            if not line:
                continue
            # split each "char/tag" token into a [char, tag] pair
            tmp = list(map(lambda x: x.split('/'), line.split()))
            datas.append(tmp)
            i += 1
    segger.train(datas)
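# Usage sketch (added illustration): train the HMM segmenter on a tagged corpus file
# in the char/tag format shown above and persist it for the Sentiment code, which
# loads 'seg.pickle'. The corpus path is hypothetical, and Seg.save() is assumed to
# mirror the Seg.load() used elsewhere in the repo.
if __name__ == '__main__':
    train('data/msr_training_tagged.txt')   # hypothetical corpus path
    segger.save('seg.pickle')               # assumed counterpart of seg.load('seg.pickle')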