Example #1
# Seg, Sentiment, and root_path are provided by the surrounding project
class SimpleNLP(object):
    def __init__(self, method=1, doc=None, datalist=None):
        self.doc = doc
        self.datalist = datalist
        self.seg = Seg()
        self.sentiment = Sentiment(method)
        self.method = method

    def seg_datalist(self):
        return self.seg.seg_from_datalist(self.datalist)

    def seg_doc(self):
        return self.seg.seg_from_doc(self.doc)

    def get_keyword_datalist(self):
        return dict(self.seg.get_keyword_from_datalist(self.datalist))

    def sentiment_analysis_doc(self):
        if self.method == 1:
            self.sentiment.load_model(root_path +
                                      '/data/naivebayes_model30000v3')
        elif self.method == 2:
            self.sentiment.load_model(root_path + '/data/svmmodel10000v4')
        return self.sentiment.predict_sentence_doc(self.doc)

    def sentiment_analysis_datalist(self):
        if self.method == 1:
            self.sentiment.load_model(root_path +
                                      '/data/naivebayes_model30000v3')
        elif self.method == 2:
            self.sentiment.load_model(root_path + '/data/svmmodel10000v4')
        return self.sentiment.predict_datalist(self.datalist)

Example #2

    def __init__(self, jieba=False, stanford=True):
        if jieba:
            self.json = json
            from seg import AllInfo
            from stanford import Stanford

            self.w = AllInfo()
            self.s = None
            if stanford:
                self.st = Stanford(False)
            else:
                self.dep = Dep()
        else:
            self.p = re.compile(u"\u25aa")
            self.json = json
            self.dep = Dep()
            from seg import Seg
            from stanford import Stanford

            self.w = Seg()
            self.s = Stanford(False)
Example #3
import time

from seg import Seg

# SimpleNLP is the class shown in Example #1


def main():
    doc = '''杰森我爱你!加油你是最棒的!'''
    start_time = time.time()
    datalist = Seg().get_data_from_mysql(5, 0)
    nlp = SimpleNLP(1, doc, datalist)
    print(nlp.seg_doc())
    print(nlp.seg_datalist())

    keyword = nlp.get_keyword_datalist()
    print(keyword)
    print(len(keyword))
Example #4
# Bayes, Seg, and stop_words are provided by the surrounding project
class Sentiment:
    def __init__(self):
        self.classifier = Bayes()
        self.seg = Seg()
        self.seg.load('seg.pickle')

    def save(self, fname):
        self.classifier.save(fname)

    def load(self, fname):
        self.classifier = self.classifier.load(fname)

    def handle(self, doc):
        words = self.seg.seg(doc)
        words = self.filter_stop(words)

        return words

    def train(self, neg_docs, pos_docs):
        datas = []
        for doc in neg_docs:
            datas.append([self.handle(doc), 'neg'])
        for doc in pos_docs:
            datas.append([self.handle(doc), 'pos'])

        self.classifier.train(datas)

    def classify(self, doc):
        ret, prob = self.classifier.classify(self.handle(doc))
        if ret == 'pos':
            return prob
        else:
            return 1 - prob

    @staticmethod
    def filter_stop(words):
        return list(filter(lambda x: x not in stop_words, words))
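
A minimal usage sketch for this class, assuming the project's Bayes, Seg, and stop_words are importable and 'seg.pickle' already exists; the two one-document corpora are invented for illustration:

st = Sentiment()
st.train(neg_docs=["这部电影太难看了"], pos_docs=["太精彩了, 非常好看"])
print(st.classify("很好看"))  # probability that the text is positive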

Example #5

# requires: from random import shuffle, and (presumably) ThemeRiver from
# pyecharts; Seg, FeatureExtraction, Sentiment, and root_path come from
# the surrounding project
def Analysis(lyric, mod=True):
    if not mod:
        pos = []
        neg = []
        with open(
                "D:\\Academic_work\\01_ERG3010\\Project\\corpus\\doubandata.txt",
                'r',
                encoding='utf-8-sig') as f:
            for line in f:
                # fields are split by "##": index 1 is the star rating,
                # index 2 the review text
                line = line.split("##")
                try:
                    star = int(line[1])
                except (IndexError, ValueError):
                    continue
                if star == 1 or star == 2:
                    neg.append(line[2].strip('\n'))
                elif star == 4 or star == 5:
                    pos.append(line[2].strip('\n'))
        ''' segment '''
        seg_pos = Seg().seg_from_datalist(pos)
        seg_neg = Seg().seg_from_datalist(neg)
        ''' training & test '''
        word_list = []
        label_list = []
        train_data = []
        shuffle(seg_pos)
        shuffle(seg_neg)
        for k in seg_pos[:500]:
            train_data.append(('pos', k))
            word_list.append(k)
            label_list.append('pos')
        for k in seg_neg[:500]:
            train_data.append(('neg', k))
            word_list.append(k)
            label_list.append('neg')
        ''' train, test '''
        fe = FeatureExtraction(word_list, label_list)
        # keep the computed feature words; do not overwrite them with a file path
        best_words = fe.best_words(3000, False)
        model = Sentiment(best_words)
        model.train_model(train_data)
        model.save_model(root_path + "\\lyricsAnalysis2\\svmmodel")
    else:
        model = Sentiment()
        model.load_model(root_path + "\\lyricsAnalysis2\\svmmodel")

    result = model.predict_datalist(lyric)  # lyric is a list, one entry per song
    data = []
    count = 1
    for prob in result:
        data.append([count, prob, "Pos"])
        data.append([count, 1 - prob, "Neg"])
        count += 1
    ''' text visualization '''
    tr = ThemeRiver("Sentiment", title_color="#274C77", title_text_size=20)
    tr.add(['Pos', 'Neg'],
           data,
           is_label_show=True,
           is_datazoom_show=True,
           legend_text_color="#274C77",
           legend_text_size=15)
    tr.render("ThemeRiver.html")
Example #6
import gzip
import pickle

import numpy as np
from sklearn.svm import SVC
import joblib  # older scikit-learn code may use: from sklearn.externals import joblib

from seg import Seg


class SVM(object):
    def __init__(self, c, best_words):
        self.seg = Seg()
        self.clf = SVC(probability=True, C=c)
        self.train_data = []
        self.train_label = []
        self.best_words = best_words

    def words2vector(self, all_data):
        vectors = []
        for data in all_data:
            vector = []
            for feature in self.best_words:
                vector.append(data.count(feature))
            vectors.append(vector)
            # print(vector)
        vectors = np.array(vectors)
        return vectors

    def train_model(self, data):
        print("------ SVM Classifier is training ------")
        for d in data:
            label = d[0]
            doc = d[1]
            self.train_data.append(doc)
            self.train_label.append(label)

        # keep the word lists as plain Python lists: np.array() on ragged
        # lists breaks on recent NumPy, and words2vector() only needs .count()
        train_vectors = self.words2vector(self.train_data)
        self.clf.fit(train_vectors, self.train_label)

        print("------ SVM Classifier training over ------")

    def save_model(self, filename):
        print("------ SVM Classifier is saving model ------")
        joblib.dump(self.clf, filename+'-model.m')
        f = gzip.open(filename + '-bestwords.dat', 'wb')
        d = {}
        d['best words'] = self.best_words
        f.write(pickle.dumps(d))
        f.close()
        print("------ SVM Classifier saving model over ------")

    def load_model(self, filename):
        print("------ SVM Classifier is loading model ------")
        self.clf = joblib.load(filename+'-model.m')

        f = gzip.open(filename+'-bestwords.dat', 'rb')
        d = pickle.loads(f.read())
        f.close()
        self.best_words = d['best words']
        print("------ SVM Classifier loading model over ------")

    def predict_wordlist(self, sentence):
        vector = self.words2vector([sentence])
        prediction = self.clf.predict(vector)
        # classes_ is sorted alphabetically (['neg', 'pos']), so index 1 is
        # the probability of the 'pos' class
        prob = self.clf.predict_proba(vector)[0][1]
        return prediction[0], prob

    def predict_sentence(self, sentence):
        seged_sentence = self.seg.seg_from_doc(sentence)
        prediction, prob = self.predict_wordlist(seged_sentence)
        return prediction, prob

    def predict_datalist(self, datalist):
        seged_datalist = self.seg.seg_from_datalist(datalist)
        result = []
        for data in seged_datalist:
            prediction, prob = self.predict_wordlist(data)
            result.append(prob)
        return result
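
A hypothetical round trip for the SVM class, with invented feature words and a tiny pre-segmented training set (real use feeds it hundreds of segmented reviews, as in Example #5); constructing SVM() still requires the project's seg module:

best_words = ["爱", "加油", "最棒", "难过", "伤心"]   # invented feature words
svm = SVM(c=50, best_words=best_words)
train = [('pos', ["爱", "加油"]), ('pos', ["最棒", "爱"]),
         ('pos', ["加油", "最棒"]), ('pos', ["爱", "最棒", "加油"]),
         ('pos', ["加油", "爱"]),
         ('neg', ["难过", "伤心"]), ('neg', ["伤心"]),
         ('neg', ["难过"]), ('neg', ["伤心", "难过"]),
         ('neg', ["难过", "伤心", "伤心"])]
svm.train_model(train)
label, prob = svm.predict_wordlist(["爱", "最棒"])
print(label, prob)  # expected: 'pos' with a high positive probability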
Example #7
 def __init__(self, c, best_words):
     self.seg = Seg()
     self.clf = SVC(probability=True, C=c)
     self.train_data = []
     self.train_label = []
     self.best_words = best_words
Example #8
 def get_seg(self, fname='seg.pickle'):
     seg = Seg()
     seg.load(fname)
     return seg
Example #9
 def __init__(self):
     self.classifier = Bayes()
     self.seg = Seg()
     self.seg.load('seg.pickle')

Example #10

# note: the Merge class below is Python 2 code (print statements, xrange,
# byte-string encode/decode calls)
class Merge:

    # @jieba: True means jieba is used for NER.
    # Because jieba's NER results are not good enough, @jieba must stay False
    # until a better approach is found; that means self.s is always non-None.
    def __init__(self, jieba=False, stanford=True):
        if jieba:
            self.json = json
            from seg import AllInfo
            from stanford import Stanford

            self.w = AllInfo()
            self.s = None
            if stanford:
                self.st = Stanford(False)
            else:
                self.dep = Dep()
        else:
            self.p = re.compile(u"\u25aa")
            self.json = json
            self.dep = Dep()
            from seg import Seg
            from stanford import Stanford

            self.w = Seg()
            self.s = Stanford(False)

    # ner and pos are comma-separated tag strings such as "nn,nr,vv,ww";
    # dep entries look like "word_id@@word dep+head" joined by "\t";
    # the input is one line as a str
    def _merge_with_str(self, line_ner, line_pos, line_dep, line_seg):
        ner = line_ner.split(",")
        pos = line_pos.split(",")
        if line_dep is not None:
            deps = line_dep.split("\t")
            line = ""
            lens = len(ner) - 1
            for dep in deps:
                info = dep.split("@@")
                id = int(info[0])
                if id > lens:
                    continue
                line += info[1].decode("gbk")
                line += " " + pos[id] + "\t"
                # line += ' '+ner[id]+' '+pos[id]+'\t'
            line = line.strip("\t")
            return line
        else:
            seg = line_seg.split(" ")
            line = ""
            if len(seg) != len(pos):
                print line_seg.encode("utf-8")
                print line_pos
            for id in xrange(len(ner)):
                if ner[id] != "O":
                    seg[id] = ner[id]
                    # line += seg[id] + ' ' + ner[id]+' '+pos[id]+'\t'
                line += seg[id] + " " + pos[id] + "\t"
            line = line.strip("\t")
            return line

    # parse a JSON line into its ner/pos/seg fields
    def _process(self, line_json):
        decoded = self.json.loads(line_json)
        line_ner = decoded["ner"]
        line_pos = decoded["pos"]
        line_seg = decoded["seg"]
        return (line_ner, line_pos, line_seg)

    # get all info for a line without merging it into one string (returns a tuple)
    def _process_line(self, line_json):
        (line_ner, line_pos, line_seg) = self._process(line_json)
        line_dep = self.dep.dep_from_line(line_seg.encode("gbk"))
        deps = line_dep.split("\t")
        line = ""
        for dep in deps:
            info = dep.split("@@")
            info = info[1].split(" ")
            line += info[1].decode("gbk") + " "
        line = line.strip(" ")
        return (line_ner, line_pos, line_seg, line)

    # parse the line and merge all info.
    # intended for when only the JSON (seg, ner, pos) from indri is available:
    # it can get the dep from Stanford and merge it into one line,
    # so it should not be used right now
    def merge(self, line_json, dep=False):
        if dep:
            (line_ner, line_pos, line_seg) = self._process(line_json)
            line_seg = self.p.sub(".", line_seg)
            line_dep = self.dep.dep_from_line(line_seg.encode("gbk"))
            line = self._merge_with_str(line_ner, line_pos, line_dep, None)
            return line
        else:
            (line_ner, line_pos, line_seg) = self._process(line_json)
            line = self._merge_with_str(line_ner, line_pos, None, line_seg)
            return line

    def add_new_words(self, newwords):
        self.w.add_words(newwords)

    def ner_using_nlpc(self, line):
        (line_seg, pos, ner) = self.w.getInfo(line)
        line_ner = self.dep._dep_line(line_seg.encode("gbk", "ignore"))
        sner = line_ner.split("\t")
        # (line_seg,line_pos,line_ner) = self.dep._dep_all(line.encode('gbk','ignore'))
        if len(ner) != len(sner):
            return ("", "", "")
        for i in xrange(len(ner)):
            j = ner[i]
            if j != "other":
                sner[i] = j
        return ("\t".join(line_seg.split(" ")), "\t".join(pos), "\t".join(sner))
        # return (line_seg,line_pos,line_ner)

    # build the JSON for a line (currently used for testing).
    # the JSON (seg, ner, pos) should come from indri, with Stanford then
    # providing the dep; dep is no longer needed, so it is not returned.
    # if stanford is True, this method must change
    def _get_line_json(self, line):
        if self.s is not None:
            dict = {"seg": "", "ner": "", "pos": ""}
            dict["seg"] = self.w.seg(line)
            (dict["ner"], dict["pos"]) = self.s.get_ner_pos(dict["seg"])
            return self.json.dumps(dict)
        else:
            (line_seg, pos, ner) = self.w.getInfo(line)
            (line_ner, line_pos) = self.st.get_ner_pos(line_seg)
            sner = line_ner.split(",")
            if len(ner) != len(sner):
                return ("", "", "")
                # print line_seg.encode('utf-8')
                # print ','.join(ner)
                # print line_ner
            for i in xrange(len(ner)):
                j = ner[i]
                if j != "other":
                    sner[i] = j
                    # print ','.join(sner)
            return (line_seg, line_pos, ",".join(sner))

    # @dep: False means no dependency information
    def get_line_info(self, line_json, dep=False):
        if self.s is not None:
            if dep:
                return self._process_line(line_json)
            else:
                (line_ner, line_pos, line_seg) = self._process(line_json)
                return (line_ner, line_pos, line_seg, None)
        else:
            (line_seg, line_pos, line_ner) = self._get_line_json(line_json)
            print line_seg
            print line_pos
            return (",".join(line_ner), ",".join(line_pos), " ".join(line_seg), None)

    # testing helper: uses the jieba and Stanford tools to get NER and POS
    # results, converts them to JSON, and runs merge() to check that it prints
    # the correct pattern:
    # "word dep ner pos\tword dep ner pos" (dep results are joined with ',')
    def test(self):
        for line in sys.stdin:
            line = line.strip("\n")
            # print line
            line_json = self._get_line_json(line)
            line = self.merge(line_json)
            print line.encode("utf-8")

    # testing helper
    def test2(self):
        for line in sys.stdin:
            line = line.strip("\n")
            if self.s is not None:
                line_json = self._get_line_json(line)
                (line_ner, line_pos, line_seg, line_dep) = self.get_line_info(line_json, False)
                print line_ner
                print line_pos
                print line_seg.encode("utf-8")
                # print line_dep.encode('utf-8')
            else:
                (line_ner, line_pos, line_seg, line_dep) = self.get_line_info(line, False)
                print line_seg.encode("utf-8")
                print line_ner
                print line_pos
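
To make the merge formats concrete, here is a standalone Python 3 sketch of the seg branch of _merge_with_str; the sample sentence and its tags are invented:

line_ner = "nr,O,O"        # comma-separated NER tags; "O" means not an entity
line_pos = "nr,v,n"        # comma-separated POS tags
line_seg = "杰森 爱 音乐"    # space-separated segmentation

seg = line_seg.split(" ")
pos = line_pos.split(",")
for i, tag in enumerate(line_ner.split(",")):
    if tag != "O":
        seg[i] = tag       # entity words are replaced by their NER tag
merged = "\t".join("{} {}".format(w, p) for w, p in zip(seg, pos))
print(merged)              # nr nr	爱 v	音乐 n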

Example #11

 def __init__(self, best_words=None):
     self.svm = SVM(50, best_words)
     self.seg = Seg()
Example #12
 def __init__(self, method=1, doc=None, datalist=None):
     self.doc = doc
     self.datalist = datalist
     self.seg = Seg()
     self.sentiment = Sentiment(method)
     self.method = method
Example #13
from seg import Seg

'''
1. Estimate the HMM parameters by direct (maximum-likelihood count) estimation.
2. Use the Viterbi algorithm to choose the most probable segmentation sequence.

'''

segger = Seg()


# sample training line (each character tagged with its in-word position):
# 澳/b 门/e 的/s 回/b 归/e 一/b 定/e 能/b 够/e 顺/b 利/e 实/b 现/e 。/s
def train(fname):
    datas = []
    i = 0
    with open(fname, 'r', encoding='utf-8') as f:
        for line in f:
            if i == 10000:
                break

            line = line.rstrip()
            if not line:
                continue

            tmp = list(map(lambda x: x.split('/'), line.split()))

            datas.append(tmp)

            i += 1

    segger.train(datas)
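
For reference, the training format in the comment above tags each character with its position in a word (b = begin, e = end, s = single-character word; multi-character words presumably also use a middle tag). The parsing step in train() turns one tagged line into (character, tag) pairs:

line = "澳/b 门/e 的/s 回/b 归/e"
tmp = list(map(lambda x: x.split('/'), line.split()))
print(tmp)  # [['澳', 'b'], ['门', 'e'], ['的', 's'], ['回', 'b'], ['归', 'e']]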