Example #1
# Python 2 code (xrange, print statements). pickle is from the standard
# library; PostProcess, ConsecutiveCapital, ConsecutiveNouns, VerbPhrase and
# Segmentor are assumed to be modules of this project and are not shown here.
import pickle
class EntityRelation:
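    """Group pre-segmented, POS-tagged sentences by document, then extract
    entity/relation phrases from each segment.

    The four input files are parallel, one segment per line; the two
    sentence files prefix each line with "doc:sent:seg:" indices (see the
    reading loop below).
    """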
    def __init__(self,
                 sentence_path,
                 full_sentence_path,
                 pos_path,
                 full_pos_path,
                 frequent_patterns_path,
                 significance,
                 out_path,
                 capitalize=False):
        self.Docs = []
        self.FullDocs = []
        self.POS = []
        self.FullPOS = []
        self.PP = PostProcess()
        self.CC = ConsecutiveCapital()
        self.CN = ConsecutiveNouns()
        self.VB = VerbPhrase()
        self.significance = significance
        self.capitalize = capitalize
        self.TotalWords = 0
        # 'rb' plus a context manager: pickle data is binary, and the bare
        # open() in the original never closed its file handle
        with open(frequent_patterns_path, 'rb') as fp:
            self.frequent_patterns = pickle.load(fp)
        self.out_path = out_path
        index = -1
        with open(sentence_path, 'r') as f,\
                open(pos_path, 'r') as g,\
                open(full_sentence_path, 'r') as h,\
                open(full_pos_path, 'r') as k:
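            # read the four files in lock-step, one segment per iteration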
            while True:
                sent1 = f.readline()
                sent1_full = h.readline()
                sent1_pos = g.readline().strip()
                sent1_full_pos = k.readline().strip()
                if not sent1:
                    break
                # cap the split at 3 so colons inside the text itself survive
                doc_index, sent_index, seg_index, sent1 = sent1.split(":", 3)
                full_doc_index, full_doc_sent_index, full_doc_seg_index, sent1_full = \
                    sent1_full.split(":", 3)
                doc_index = int(doc_index)
                sent_index = int(sent_index)
                seg_index = int(seg_index)
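                # grow the per-document lists until doc_index has a slot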
                while doc_index > index:
                    index += 1
                    self.FullDocs.append([])
                    self.Docs.append([])
                    self.POS.append([])
                    self.FullPOS.append([])
                split_sentence = sent1.strip().split()
                split_full_sentence = sent1_full.strip().split()
                split_pos_tags = sent1_pos.split()
                split_full_pos_tags = sent1_full_pos.split()
                self.TotalWords += len(split_sentence)
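                # first segment of a new sentence: open a slot in each list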
                if len(self.Docs[doc_index]) == sent_index:
                    self.FullDocs[doc_index].append([])
                    self.Docs[doc_index].append([])
                    self.POS[doc_index].append([])
                    self.FullPOS[doc_index].append([])
                self.Docs[doc_index][sent_index].append(split_sentence)
                self.FullDocs[doc_index][sent_index].append(
                    split_full_sentence)
                self.POS[doc_index][sent_index].append(split_pos_tags)
                self.FullPOS[doc_index][sent_index].append(split_full_pos_tags)
        # load segmentor
        self.S = Segmentor(significance, self.frequent_patterns,
                           self.TotalWords)

    def extract(self):
        out_path = self.out_path
        with open(out_path, 'w') as f:
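            # one output line per document: doc id, a tab, comma-joined phrases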
            for i in xrange(len(self.Docs)):
                if i % 10000 == 0 and i != 0:
                    print str(i) + " documents processed"

                doc = self.Docs[i]
                full_doc = self.FullDocs[i]
                pos_for_doc = self.POS[i]
                full_pos_for_doc = self.FullPOS[i]
                for j in xrange(len(doc)):
                    sentence = doc[j]
                    full_sentence = full_doc[j]
                    sentence_pos = pos_for_doc[j]
                    full_sentence_pos = full_pos_for_doc[j]
                    final_sentence = []
                    for k in xrange(len(sentence)):
                        seg = sentence[k]
                        full_seg = full_sentence[k]
                        pos = sentence_pos[k]
                        full_seg_pos = full_sentence_pos[k]
                        # word:tag pairs; unused by the extraction below
                        # combined = [seg[m] + ":" + pos[m]
                        #             for m in xrange(len(pos))]
                        final_result = []
                        if seg:
                            #result = self.S.segment(sentence, pos)

                            #result = self.S.pattern_segment([self.CC, self.CN], seg, pos)
                            used_patterns = [self.CN]
                            if self.capitalize:
                                used_patterns.append(self.CC)
                            result = self.S.pattern_segment(
                                used_patterns, seg, pos)

                            final_result = self.PP.reconstruct(
                                result, full_seg, pos, full_seg_pos)
                        else:
                            final_result = full_seg

                        final_result = ",".join(final_result)
                        final_sentence.append(final_result)
                    f.write(str(i) + "\t" + ",".join(final_sentence) + "\n")
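
# Minimal usage sketch, not part of the original source: every path and the
# significance value below are illustrative placeholders.
if __name__ == "__main__":
    er = EntityRelation('sentences.txt', 'full_sentences.txt', 'pos.txt',
                        'full_pos.txt', 'patterns.pkl', 5.0, 'entities.out',
                        capitalize=True)
    er.extract()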
Example #2
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
from __future__ import print_function, unicode_literals
from Segmentor import Segmentor

print('Loading NAER Segmentor ... ', end='')
print(' done.')


def getSegResult(RawText):
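    # delegates to the module-level `segmentor` constructed under __main__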
    input_doc = RawText
    result = segmentor.procDoc(input_doc)
    return result


if __name__ == "__main__":
    segmentor = Segmentor()
    # Sample input, kept in Chinese because the segmentor targets Chinese
    # text. Gist: few books cover textbook design, yet it is tightly bound
    # to curricula, teaching and learning, so the institute's textbook
    # development center invited Prof. Chen Li-hua of Tamkang University to
    # lecture on textbook design research on the morning of August 27.
    RawText = '''
市面上很少有「教科書設計」的專書,因為我們總覺得那是出版社的事!
然而,真的是這樣嗎?

教科書設計其實與課程綱要、教師的教學、學生的學習息息相關,是課程、教學、學習三位一體間一個重要的環節,除了有教育學與學科專業等內容涵納其中,也與編輯、版式等視覺設計元素的概念有關。
有鑒於此議題的重要,本院教科書發展中心邀請淡江大學課程與教學研究所陳麗華所長,於8月27日上午進行「教科書設計研究」專題演講,除了院內同仁,也邀請出版社編輯企劃相關人員參與。
'''

    result = getSegResult(RawText)
    print(result)