def __init__(self, sentence_path, full_sentence_path, pos_path, full_pos_path,
             frequent_patterns_path, significance, out_path, capitalize=False):
    """Load trimmed and full sentences plus their POS tags, grouped by
    document/sentence index, and build the frequent-pattern Segmentor.

    Each line of the sentence files is "doc:sent:seg:text"; the four input
    files are read in lockstep (one segment per line).  frequent_patterns_path
    is a pickle of pattern statistics for the Segmentor, significance its
    threshold, out_path where extract() writes its results, and capitalize
    tells extract() whether capitalization patterns are usable.
    """
    self.Docs = []       # Docs[doc][sent] -> list of token-list segments (trimmed)
    self.FullDocs = []   # parallel structure holding the untrimmed tokens
    self.POS = []        # POS tags parallel to Docs
    self.FullPOS = []    # POS tags parallel to FullDocs
    self.PP = PostProcess()
    self.CC = ConsecutiveCapital()
    self.CN = ConsecutiveNouns()
    self.VB = VerbPhrase()
    self.significance = significance
    self.capitalize = capitalize
    self.TotalWords = 0
    # BUG FIX: the pickle was opened in text mode and the handle was never
    # closed.  Binary mode is required for pickles on Windows; the context
    # manager releases the file deterministically.
    with open(frequent_patterns_path, 'rb') as fp:
        self.frequent_patterns = pickle.load(fp)
    self.out_path = out_path
    index = -1
    with open(sentence_path, 'r') as f, \
            open(pos_path, 'r') as g, \
            open(full_sentence_path, 'r') as h, \
            open(full_pos_path, 'r') as k:
        while True:
            sent1 = f.readline()
            sent1_full = h.readline()
            sent1_pos = g.readline().strip()
            sent1_full_pos = k.readline().strip()
            if not sent1:
                break
            # maxsplit=3 keeps any ':' inside the sentence text intact
            # (a bare split(":") would raise ValueError on such lines).
            doc_index, sent_index, seg_index, sent1 = sent1.split(":", 3)
            # The leading indices of the full line mirror the trimmed one;
            # only the text payload is needed.
            _, _, _, sent1_full = sent1_full.split(":", 3)
            doc_index = int(doc_index)
            sent_index = int(sent_index)
            # Grow the per-document lists until doc_index is addressable.
            while doc_index > index:
                index += 1
                self.FullDocs.append([])
                self.Docs.append([])
                self.POS.append([])
                self.FullPOS.append([])
            split_sentence = sent1.strip().split()
            split_full_sentence = sent1_full.strip().split()
            split_pos_tags = sent1_pos.split()
            split_full_pos_tags = sent1_full_pos.split()
            self.TotalWords += len(split_sentence)
            # First segment of a new sentence: open a fresh slot for it.
            if len(self.Docs[doc_index]) == sent_index:
                self.FullDocs[doc_index].append([])
                self.Docs[doc_index].append([])
                self.POS[doc_index].append([])
                self.FullPOS[doc_index].append([])
            self.Docs[doc_index][sent_index].append(split_sentence)
            self.FullDocs[doc_index][sent_index].append(split_full_sentence)
            self.POS[doc_index][sent_index].append(split_pos_tags)
            self.FullPOS[doc_index][sent_index].append(split_full_pos_tags)
    # load segmentor
    self.S = Segmentor(significance, self.frequent_patterns, self.TotalWords)
class EntityRelation: def __init__(self, sentence_path, full_sentence_path, pos_path, full_pos_path, frequent_patterns_path, significance, out_path, capitalize=False): self.Docs = [] self.FullDocs = [] self.POS = [] self.FullPOS = [] self.PP = PostProcess() self.CC = ConsecutiveCapital() self.CN = ConsecutiveNouns() self.VB = VerbPhrase() self.significance = significance self.capitalize = capitalize self.TotalWords = 0 self.frequent_patterns = pickle.load(open(frequent_patterns_path, 'r')) self.out_path = out_path index = -1 with open(sentence_path, 'r') as f,\ open(pos_path, 'r') as g,\ open(full_sentence_path, 'r') as h,\ open(full_pos_path, 'r') as k: while True: sent1 = f.readline() sent1_full = h.readline() sent1_pos = g.readline().strip() sent1_full_pos = k.readline().strip() if not sent1: break doc_index, sent_index, seg_index, sent1 = sent1.split(":") full_doc_index, full_doc_sent_index, full_doc_seg_index, sent1_full = sent1_full.split( ":") doc_index = int(doc_index) sent_index = int(sent_index) seg_index = int(seg_index) while doc_index > index: index += 1 self.FullDocs.append([]) self.Docs.append([]) self.POS.append([]) self.FullPOS.append([]) split_sentence = sent1.strip().split() split_full_sentence = sent1_full.strip().split() split_pos_tags = sent1_pos.split() split_full_pos_tags = sent1_full_pos.split() self.TotalWords += len(split_sentence) if len(self.Docs[doc_index]) == sent_index: self.FullDocs[doc_index].append([]) self.Docs[doc_index].append([]) self.POS[doc_index].append([]) self.FullPOS[doc_index].append([]) self.Docs[doc_index][sent_index].append(split_sentence) self.FullDocs[doc_index][sent_index].append( split_full_sentence) self.POS[doc_index][sent_index].append(split_pos_tags) self.FullPOS[doc_index][sent_index].append(split_full_pos_tags) # load segmentor self.S = Segmentor(significance, self.frequent_patterns, self.TotalWords) def extract(self): out_path = self.out_path with open(out_path, 'w') as f: for i in xrange(len(self.Docs)): 
if i % 10000 == 0 and i != 0: print str(i) + " documents processed" doc = self.Docs[i] full_doc = self.FullDocs[i] pos_for_doc = self.POS[i] full_pos_for_doc = self.FullPOS[i] for j in xrange(len(doc)): sentence = doc[j] full_sentence = full_doc[j] sentence_pos = pos_for_doc[j] full_sentence_pos = full_pos_for_doc[j] final_sentence = [] for k in xrange(len(sentence)): seg = sentence[k] full_seg = full_sentence[k] pos = sentence_pos[k] full_seg_pos = full_sentence_pos[k] combined = [ seg[m] + ":" + pos[m] for m in xrange(len(pos)) ] final_result = [] if seg: #result = self.S.segment(sentence, pos) #result = self.S.pattern_segment([self.CC, self.CN], seg, pos) used_patterns = [self.CN] if self.capitalize: used_patterns.append(self.CC) result = self.S.pattern_segment( used_patterns, seg, pos) final_result = self.PP.reconstruct( result, full_seg, pos, full_seg_pos) else: final_result = full_seg final_result = ",".join(final_result) final_sentence.append(final_result) f.write(str(i) + "\t" + ",".join(final_sentence) + "\n")
def __init__(self, sentence_path, full_sentence_path, pos_path, full_pos_path,
             frequent_patterns_path, significance, out_path, capitalize=False):
    """Load trimmed and full sentences plus their POS tags, grouped by
    document/sentence index, and build the frequent-pattern Segmentor.

    Each line of the sentence files is "doc:sent:seg:text"; the four input
    files are read in lockstep (one segment per line).  frequent_patterns_path
    is a pickle of pattern statistics for the Segmentor, significance its
    threshold, out_path where extract() writes its results, and capitalize
    tells extract() whether capitalization patterns are usable.
    """
    self.Docs = []       # Docs[doc][sent] -> list of token-list segments (trimmed)
    self.FullDocs = []   # parallel structure holding the untrimmed tokens
    self.POS = []        # POS tags parallel to Docs
    self.FullPOS = []    # POS tags parallel to FullDocs
    self.PP = PostProcess()
    self.CC = ConsecutiveCapital()
    self.CN = ConsecutiveNouns()
    self.VB = VerbPhrase()
    self.significance = significance
    self.capitalize = capitalize
    self.TotalWords = 0
    # BUG FIX: the pickle was opened in text mode and the handle was never
    # closed.  Binary mode is required for pickles on Windows; the context
    # manager releases the file deterministically.
    with open(frequent_patterns_path, 'rb') as fp:
        self.frequent_patterns = pickle.load(fp)
    self.out_path = out_path
    index = -1
    with open(sentence_path, 'r') as f, \
            open(pos_path, 'r') as g, \
            open(full_sentence_path, 'r') as h, \
            open(full_pos_path, 'r') as k:
        while True:
            sent1 = f.readline()
            sent1_full = h.readline()
            sent1_pos = g.readline().strip()
            sent1_full_pos = k.readline().strip()
            if not sent1:
                break
            # maxsplit=3 keeps any ':' inside the sentence text intact
            # (a bare split(":") would raise ValueError on such lines).
            doc_index, sent_index, seg_index, sent1 = sent1.split(":", 3)
            # The leading indices of the full line mirror the trimmed one;
            # only the text payload is needed.
            _, _, _, sent1_full = sent1_full.split(":", 3)
            doc_index = int(doc_index)
            sent_index = int(sent_index)
            # Grow the per-document lists until doc_index is addressable.
            while doc_index > index:
                index += 1
                self.FullDocs.append([])
                self.Docs.append([])
                self.POS.append([])
                self.FullPOS.append([])
            split_sentence = sent1.strip().split()
            split_full_sentence = sent1_full.strip().split()
            split_pos_tags = sent1_pos.split()
            split_full_pos_tags = sent1_full_pos.split()
            self.TotalWords += len(split_sentence)
            # First segment of a new sentence: open a fresh slot for it.
            if len(self.Docs[doc_index]) == sent_index:
                self.FullDocs[doc_index].append([])
                self.Docs[doc_index].append([])
                self.POS[doc_index].append([])
                self.FullPOS[doc_index].append([])
            self.Docs[doc_index][sent_index].append(split_sentence)
            self.FullDocs[doc_index][sent_index].append(split_full_sentence)
            self.POS[doc_index][sent_index].append(split_pos_tags)
            self.FullPOS[doc_index][sent_index].append(split_full_pos_tags)
    # load segmentor
    self.S = Segmentor(significance, self.frequent_patterns, self.TotalWords)
class EntityRelation: def __init__(self,sentence_path, full_sentence_path, pos_path,full_pos_path, frequent_patterns_path,significance,out_path,capitalize=False): self.Docs = [] self.FullDocs = [] self.POS = [] self.FullPOS = [] self.PP = PostProcess() self.CC = ConsecutiveCapital() self.CN = ConsecutiveNouns() self.VB = VerbPhrase() self.significance = significance self.capitalize=capitalize self.TotalWords = 0 self.frequent_patterns = pickle.load(open(frequent_patterns_path,'r')) self.out_path = out_path index = -1 with open(sentence_path, 'r') as f,\ open(pos_path, 'r') as g,\ open(full_sentence_path, 'r') as h,\ open(full_pos_path, 'r') as k: while True: sent1 = f.readline() sent1_full = h.readline() sent1_pos = g.readline().strip() sent1_full_pos = k.readline().strip() if not sent1: break doc_index, sent_index, seg_index, sent1 = sent1.split(":") full_doc_index, full_doc_sent_index, full_doc_seg_index, sent1_full = sent1_full.split(":") doc_index = int(doc_index) sent_index = int(sent_index) seg_index = int(seg_index) while doc_index > index: index += 1 self.FullDocs.append([]) self.Docs.append([]) self.POS.append([]) self.FullPOS.append([]) split_sentence = sent1.strip().split() split_full_sentence = sent1_full.strip().split() split_pos_tags = sent1_pos.split() split_full_pos_tags = sent1_full_pos.split() self.TotalWords += len(split_sentence) if len(self.Docs[doc_index]) == sent_index: self.FullDocs[doc_index].append([]) self.Docs[doc_index].append([]) self.POS[doc_index].append([]) self.FullPOS[doc_index].append([]) self.Docs[doc_index][sent_index].append(split_sentence) self.FullDocs[doc_index][sent_index].append(split_full_sentence) self.POS[doc_index][sent_index].append(split_pos_tags) self.FullPOS[doc_index][sent_index].append(split_full_pos_tags) # load segmentor self.S = Segmentor(significance, self.frequent_patterns, self.TotalWords) def extract(self): out_path = self.out_path with open(out_path, 'w') as f: for i in xrange(len(self.Docs)): if i%10000 
==0 and i!=0 : print str(i)+" documents processed" doc = self.Docs[i] full_doc = self.FullDocs[i] pos_for_doc = self.POS[i] full_pos_for_doc = self.FullPOS[i] for j in xrange(len(doc)): sentence = doc[j] full_sentence = full_doc[j] sentence_pos = pos_for_doc[j] full_sentence_pos = full_pos_for_doc[j] final_sentence = [] for k in xrange(len(sentence)): seg = sentence[k] full_seg = full_sentence[k] pos = sentence_pos[k] full_seg_pos = full_sentence_pos[k] combined = [seg[m]+":"+pos[m] for m in xrange(len(pos))] final_result = [] if seg: #result = self.S.segment(sentence, pos) #result = self.S.pattern_segment([self.CC, self.CN], seg, pos) used_patterns = [self.CN] if self.capitalize: used_patterns.append(self.CC) result = self.S.pattern_segment(used_patterns, seg, pos) final_result = self.PP.reconstruct(result,full_seg, pos, full_seg_pos) else: final_result = full_seg final_result = ",".join(final_result) final_sentence.append(final_result) f.write(str(i) + "\t" + ",".join(final_sentence)+"\n")
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import print_function, unicode_literals

from Segmentor import Segmentor


def getSegResult(RawText):
    """Segment a whole document string with the module-level NAER
    segmentor and return its result.

    NOTE(review): relies on the global `segmentor` created under the
    __main__ guard — calling this after a plain import raises NameError,
    as in the original; confirm whether importers are expected to build
    their own `segmentor` first.
    """
    return segmentor.procDoc(RawText)


if __name__ == "__main__":
    # BUG FIX: the loading messages used to print at import time even
    # though the Segmentor is only constructed here; they now bracket the
    # construction that actually does the loading.
    print('Loading NAER Segmentor ... ', )
    segmentor = Segmentor()
    print(' done.')
    RawText = '''
市面上很少有「教科書設計」的專書,因為我們總覺得那是出版社的事! 然而,真的是這樣嗎? 教科書設計其實與課程綱要、教師的教學、學生的學習息息相關,是課程、教學、學習三位一體間一個重要的環節,除了有教育學與學科專業等內容涵納其中,也與編輯、版式等視覺設計元素的概念有關。 有鑒於此議題的重要,本院教科書發展中心邀請淡江大學課程與教學研究所陳麗華所長,於8月27日上午進行「教科書設計研究」專題演講,除了院內同仁,也邀請出版社編輯企劃相關人員參與。
    '''
    result = getSegResult(RawText)
    print(result)