Example #2

import pickle
# PostProcess, ConsecutiveCapital, ConsecutiveNouns, VerbPhrase and
# Segmentor come from this project's own modules (not shown in this excerpt)

class EntityRelation:
    def __init__(self,
                 sentence_path,
                 full_sentence_path,
                 pos_path,
                 full_pos_path,
                 frequent_patterns_path,
                 significance,
                 out_path,
                 capitalize=False):
        self.Docs = []
        self.FullDocs = []
        self.POS = []
        self.FullPOS = []
        self.PP = PostProcess()
        self.CC = ConsecutiveCapital()
        self.CN = ConsecutiveNouns()
        self.VB = VerbPhrase()
        self.significance = significance
        self.capitalize = capitalize
        self.TotalWords = 0
        # load the pickled frequent patterns; binary mode and a context
        # manager so the file handle is closed promptly
        with open(frequent_patterns_path, 'rb') as fp:
            self.frequent_patterns = pickle.load(fp)
        self.out_path = out_path
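        # Each input line is "doc_index:sent_index:seg_index:tokens",
        # e.g. "12:3:0:the tokens of this segment"; the loop below rebuilds
        # the nested [doc][sentence][segment] lists from these lines.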
        index = -1
        with open(sentence_path, 'r') as f,\
                open(pos_path, 'r') as g,\
                open(full_sentence_path, 'r') as h,\
                open(full_pos_path, 'r') as k:
            while True:
                sent1 = f.readline()
                sent1_full = h.readline()
                sent1_pos = g.readline().strip()
                sent1_full_pos = k.readline().strip()
                if not sent1:
                    break
                # split only on the first three colons so that colons inside
                # the text itself do not break the unpacking
                doc_index, sent_index, seg_index, sent1 = sent1.split(":", 3)
                full_doc_index, full_doc_sent_index, full_doc_seg_index, sent1_full = sent1_full.split(":", 3)
                doc_index = int(doc_index)
                sent_index = int(sent_index)
                seg_index = int(seg_index)
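                # pad the per-document lists so doc_index is a valid index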
                while doc_index > index:
                    index += 1
                    self.FullDocs.append([])
                    self.Docs.append([])
                    self.POS.append([])
                    self.FullPOS.append([])
                split_sentence = sent1.strip().split()
                split_full_sentence = sent1_full.strip().split()
                split_pos_tags = sent1_pos.split()
                split_full_pos_tags = sent1_full_pos.split()
                self.TotalWords += len(split_sentence)
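                # first segment of a new sentence: open a fresh slot in each list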
                if len(self.Docs[doc_index]) == sent_index:
                    self.FullDocs[doc_index].append([])
                    self.Docs[doc_index].append([])
                    self.POS[doc_index].append([])
                    self.FullPOS[doc_index].append([])
                self.Docs[doc_index][sent_index].append(split_sentence)
                self.FullDocs[doc_index][sent_index].append(
                    split_full_sentence)
                self.POS[doc_index][sent_index].append(split_pos_tags)
                self.FullPOS[doc_index][sent_index].append(split_full_pos_tags)
        # load segmentor
        self.S = Segmentor(significance, self.frequent_patterns,
                           self.TotalWords)

    def extract(self):
        out_path = self.out_path
        with open(out_path, 'w') as f:
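            # one output line per document: "<doc_id>\t<comma-joined phrases>"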
            for i in xrange(len(self.Docs)):
                if i % 10000 == 0 and i != 0:
                    print str(i) + " documents processed"

                doc = self.Docs[i]
                full_doc = self.FullDocs[i]
                pos_for_doc = self.POS[i]
                full_pos_for_doc = self.FullPOS[i]
                for j in xrange(len(doc)):
                    sentence = doc[j]
                    full_sentence = full_doc[j]
                    sentence_pos = pos_for_doc[j]
                    full_sentence_pos = full_pos_for_doc[j]
                    final_sentence = []
                    for k in xrange(len(sentence)):
                        seg = sentence[k]
                        full_seg = full_sentence[k]
                        pos = sentence_pos[k]
                        full_seg_pos = full_sentence_pos[k]
                        combined = [
                            seg[m] + ":" + pos[m] for m in xrange(len(pos))
                        ]
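                        # note: 'combined' ("token:POS" pairs) is computed here
                        # but never used in the rest of this method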
                        final_result = []
                        if seg:
                            #result = self.S.segment(sentence, pos)

                            #result = self.S.pattern_segment([self.CC, self.CN], seg, pos)
                            used_patterns = [self.CN]
                            if self.capitalize:
                                used_patterns.append(self.CC)
                            result = self.S.pattern_segment(
                                used_patterns, seg, pos)

                            final_result = self.PP.reconstruct(
                                result, full_seg, pos, full_seg_pos)
                        else:
                            final_result = full_seg

                        final_result = ",".join(final_result)
                        final_sentence.append(final_result)
                    f.write(str(i) + "\t" + ",".join(final_sentence) + "\n")
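
A minimal driver for the class above might look like this; every file name and the significance value are illustrative assumptions, not values from the original project (note the code base is Python 2, hence xrange and the print statement above):

er = EntityRelation('segments.txt',           # lines of "doc:sent:seg:tokens"
                    'segments_full.txt',      # unfiltered counterpart of segments.txt
                    'pos.txt',                # POS tags aligned with segments.txt
                    'pos_full.txt',           # POS tags aligned with segments_full.txt
                    'frequent_patterns.pkl',  # pickled frequent-pattern statistics
                    significance=5.0,         # threshold handed to Segmentor
                    out_path='entities.txt',
                    capitalize=True)          # also apply the ConsecutiveCapital pattern
er.extract()                                  # writes "<doc_id>\t<phrases>" lines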
Example #4
                     try:
                        startTime = time.time()
                        sc.Solve()
                        endTime = time.time()
                        oh.WriteToLog(
                            mc, "Total time elapsed for solving: " +
                            str(endTime - startTime))

                         # presumably the fraction of the requested time span that was solved
                         completed = dc.time / sc.tEnd
                    except:
                        err = traceback.format_exc()  # Traceback of error
                        oh.WriteToLog(mc, err)

                    try:
                        # Postprocess
                        from PostProcess import PostProcess
                        PostProcess(mc)
                    except:
                        err = traceback.format_exc()  # Traceback of error
                        oh.WriteToLog(mc, err)

                    strWrite = '{: <16s}'.format('E_inner')     + '{: <16f}'.format(p.E_inner) + '\n' +\
                         '{: <16s}'.format('E_ratio')     + '{: <16.6f}'.format(p.E_ratio) + '\n' +\
                         '{: <16s}'.format('kth1_i')      + '{: <16.6f}'.format(p.kth1_i) + '\n' +\
                         '{: <16s}'.format('kth2_i')      + '{: <16.6f}'.format(p.kth2_i) + '\n' +\
                         '{: <16s}'.format('kth1_o')      + '{: <16.6f}'.format(p.kth1_o) + '\n' +\
                         '{: <16s}'.format('kth2_o')      + '{: <16.6f}'.format(p.kth2_o) + '\n' +\
                         '{: <16s}'.format('adv_R_alpha') + '{: <16.6f}'.format(p.adv_R_alpha) + '\n' +\
                         '{: <16s}'.format('Completed')  + '{: <16.6f}'.format(completed)
                    oh.WriteToOutput(mc, 'info.txt', strWrite)

                    os.chdir(mc.folderMain)
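
The fragment above wraps sc.Solve() in a generic time-and-log pattern. Below is a self-contained sketch of that pattern, with the project-specific oh.WriteToLog(mc, ...) replaced by the standard logging module (an assumption; oh, mc, sc and dc are not defined in this excerpt):

import logging
import time
import traceback

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

def timed_solve(solver):
    # Time the solve and log any traceback instead of crashing,
    # mirroring the try/except blocks in the fragment above.
    start_time = time.time()
    try:
        solver.Solve()
    except Exception:
        log.error(traceback.format_exc())
    log.info("Total time elapsed for solving: " + str(time.time() - start_time))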
Example #6
 def OnInit(self):
     wx.InitAllImageHandlers()        # register wx image format handlers
     pp = PostProcess(None, -1, "")   # here PostProcess is a wx top-level frame
     self.SetTopWindow(pp)
     pp.Show()
     return True                      # truthy return tells wx startup succeeded
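
For context, OnInit is the startup hook of a wx.App subclass. A minimal runnable sketch follows; the PostProcessApp name is an assumption, and PostProcess is the wx frame used in the snippet above:

import wx
# from PostProcess import PostProcess  # the wx frame shown above

class PostProcessApp(wx.App):
    def OnInit(self):
        wx.InitAllImageHandlers()
        pp = PostProcess(None, -1, "")  # parent=None, default id, empty title
        self.SetTopWindow(pp)
        pp.Show()
        return True  # report successful startup to wx

if __name__ == "__main__":
    app = PostProcessApp(0)  # 0: do not redirect stdout/stderr to a window
    app.MainLoop()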