Esempio n. 1
0
 def train(self):
     """Train the sentiment classifier on the labelled pos/neg corpora.

     Reads one file of positive samples (``pos_path``) and one of negative
     samples (``neg_path``); each line is segmented with jieba, stop words
     are removed, and the tokens are paired with their label.
     """
     data = []
     # One (path, label) pair per corpus file — avoids duplicating the loop.
     for path, label in ((pos_path, 'pos'), (neg_path, 'neg')):
         with open(path, encoding='utf8', mode='r') as f:
             # Iterate the file lazily instead of materialising readlines().
             for line in f:
                 data.append([filter_stop(jieba.cut(line.strip())), label])
     self.classifier.train(data=data)
Esempio n. 2
0
def load_paper_data():
    """Load the tagged-corpus file and return a list of token lists.

    Each input line is whitespace-separated tokens of the form ``word/POS``;
    the leading field (presumably a document/date id — confirm against the
    corpus format) is dropped, the word part is kept, punctuation and stop
    words are removed, and empty results are skipped.
    """
    data_list = []
    with open(paper_path, encoding='gb2312', mode='r', errors='ignore') as f:
        # Iterate lazily rather than materialising readlines().
        for line in f:
            # Keep only the word before the '/' POS tag, strip punctuation.
            words = (delete_punctuation(tok.split('/')[0])
                     for tok in line.split()[1:])
            # filter(None, ...) drops empty strings — equivalent to the
            # original lambda x: x != '' test for str elements.
            t = filter_stop(filter(None, words))
            if t:
                data_list.append(t)
    return data_list
Esempio n. 3
0
 def train(self):
     """Train the category classifier from the per-category text folders.

     For every (category, folder) pair in ``category_path``, each text file
     is flattened into a single whitespace-free string, segmented with
     jieba, stop-word-filtered, and labelled with the category key.
     """
     data = []
     for key, path in category_path.items():
         for txt_path in fetch_file_path(path):
             current_path = os.path.join(path, txt_path)
             with open(current_path, mode='r', encoding='gb2312',
                       errors='ignore') as f:
                 # Collapse the document to one string with all spaces
                 # removed; generator expression avoids readlines()/lambda.
                 text = ''.join(line.strip().replace(' ', '') for line in f)
             data.append([filter_stop(jieba.cut(text)), key])
     self.classifier.train(data=data)
Esempio n. 4
0
 def simhash(self, doc1, doc2):
     """Print the Hamming distance between the simhashes of two documents.

     Each document is segmented with jieba and stop-word-filtered, word
     frequencies are counted, and ``merge_word`` folds the counts into a
     simhash fingerprint.
     """
     # Local import: Counter replaces the two hand-rolled counting loops.
     from collections import Counter
     # Counter is a dict subclass, so merge_word sees the same
     # word -> frequency mapping the manual loops produced.
     d1_counts = Counter(filter_stop(jieba.cut(doc1)))
     d2_counts = Counter(filter_stop(jieba.cut(doc2)))
     n1 = self.merge_word(d1_counts)
     n2 = self.merge_word(d2_counts)
     t = hamming(n1, n2)
     print(t)
Esempio n. 5
0
 def predict(self, key):
     """Print and fetch the best-matching document for a query string.

     The query is segmented and stop-word-filtered, then every document is
     scored term-by-term (BM25-style, via ``word_score`` with the length
     normaliser ``K``); the first document with the strictly highest
     positive score wins (ties keep the earlier index, and an all-zero
     scan leaves index 0).
     """
     self.load()
     terms = filter_stop(jieba.cut(key))
     dl, avgdl = self.get_dl_avgdl(terms)
     K = self.get_K(dl, avgdl)
     best_index, best_score = 0, 0
     doc_count = self.tf_idf.tf_list.shape[0]
     for idx in range(doc_count):
         total = sum(self.word_score(term, idx, K) for term in terms)
         if total > best_score:
             best_score, best_index = total, idx
     print('max', best_index, best_score)
     self.get_doc_word(best_index)
Esempio n. 6
0
 def predict(self, sent):
     """Segment *sent*, drop stop words, and classify the remaining tokens."""
     return self.classifier.predict(filter_stop(jieba.cut(sent)))
Esempio n. 7
0
 def classify(self, sent):
     """Strip, segment, and stop-word-filter *sent*, then classify it."""
     cleaned = sent.strip()
     tokens = filter_stop(jieba.cut(cleaned))
     return self.classifier.predict(tokens)