def syntactic_parse(pos_list, grammar):
    """Parse a POS-tag sequence with a Viterbi PCFG parser.

    Returns the best parse tree, or None when the grammar does not
    cover every token in ``pos_list`` or when no parse exists.
    """
    viterbi = ViterbiParser(grammar)
    try:
        grammar.check_coverage(pos_list)
    except ValueError:
        # At least one token is absent from the grammar's lexicon.
        return None
    # parse() yields trees best-first; take the first one (None if empty).
    return next(iter(viterbi.parse(pos_list)), None)
def parse(self, grammar, trace=0):
    """Parse ``self.tokens`` with the NLTK Viterbi parser.

    :param grammar: the (adapted) WeightedGrammar object to parse with
    :param trace: verbosity level handed to the parser
    :returns: the single best parse (carries its probability score)
    """
    viterbi_parser = ViterbiParser(grammar)
    viterbi_parser.trace(trace)
    # nbest_parse (legacy NLTK API) returns parses ordered best-first;
    # the best parse is therefore the first element.
    return viterbi_parser.nbest_parse(self.tokens)[0]
def __init__(self, the_grammar):
    """Set up the segmenter around a Viterbi parser for ``the_grammar``."""
    print('Initialization of ParserSeger...')
    # Upper bound on the length of candidate words tried during segmentation.
    self.max_word_len = 4
    self.parser = ViterbiParser(the_grammar)
    print('done')
"state": { "equation": { "id": "eqn", "value": sin, "contentEditable": True }, }, }) print(reqReq) print("test input:", sin) print("test output:", reqReq.json()) fdir = "/Users/gabriel/" fname = "examples2" parser = ViterbiParser(grammar) with open(fdir + fname) as fin: sentences = set() mod = 0 for eq in fin: eq = eq.split("\t")[2] eq = eq.lower() eq = filter(lambda ch: ch in "()1234567890+-*/.;=x", eq) proc = "" for i in eq: proc += i past = False for sin in proc.split(";"):
# NOTE(review): script fragment — `eq`, `sentences`, `grammar` and
# `pCFG_Grammar` come from code outside this excerpt; the flattened
# layout was reconstructed, tokens unchanged. Indentation of the loop
# bodies is inferred — TODO confirm against the original file.
eq = eq.lower()
# Keep only characters from the small equation alphabet.
eq = filter(lambda ch: ch in "()1234567890+-*/.;=x", eq)
proc = ""
for i in eq:
    proc += i
# Each ';'-separated piece of the equation becomes one training "sentence".
for sin in proc.split(";"):
    print(sin)
    sentences.add(sin)
# Split every sentence into single characters for the character-level grammar.
sents = [[c for c in s] for s in sentences]
prob_inducer = pCFG_Grammar()
prob_inducer.grammar = grammar
parser = ViterbiParser(grammar)
test = '(5-9x)-5+18=-2/7x'
#test = '5+18=2/7x'
# Smoke-test the parser on one equation (note: `i` shadows the loop var above).
for i in parser.parse([i for i in test]):
    print(i)
print("INDUCING WEIGHTS:")
prob_inducer.induce_weights(sentences)
grammar = prob_inducer.grammar
print("GRAMMAR:")
print(grammar)
def tree_features(tree, path):
    """Recursively extract ((feature-name, path), value) pairs from a parse tree.

    For every (sub)tree two features are emitted: the node label and the
    concatenation of the leaves it spans, both keyed by the node's path
    from the root.  Only the first two children are descended into, so
    trees are assumed to be (at most) binary — TODO confirm with callers.

    :param tree: an nltk.Tree-like object (supports label(), leaves(),
        len() and integer indexing)
    :param path: hashable description of this node's position in the root
    :returns: list of ((feature-name, path), value) tuples
    """
    # Surface string spanned by this node; join is linear, unlike +=.
    node_str = "".join(tree.leaves())
    ret = [
        (('tree-label', path), tree.label()),
        (('value', path), node_str),
    ]
    # Fewer than two children: treat as a terminal node, do not recurse.
    # (Removed a leftover debug print of len(tree) here.)
    if len(tree) < 2:
        return ret
    left_rt = tree_features(tree[0], ('left-tree', path))
    right_rt = tree_features(tree[1], ('right-tree', path))
    return ret + left_rt + right_rt


if __name__ == "__main__":
    # NOTE(review): `grammar` and `ViterbiParser` are defined elsewhere in
    # the original script; this demo block is kept as-is.
    print(grammar)
    parser = ViterbiParser(grammar)
    sent = [c for c in "-7/2"]
    print(sent)
    for tree in parser.parse(sent):
        print(tree)
        print(tree_features(tree, 'some attribute'))
class ParserSeger():
    """Word segmenter that scores candidate words with a Viterbi PCFG parser
    and picks the best segmentation of a sentence by dynamic programming."""

    def __init__(self, the_grammar):
        """Build the segmenter around a Viterbi parser for ``the_grammar``."""
        print('Initialization of ParserSeger...')
        self.parser = ViterbiParser(the_grammar)
        # Longest candidate word considered during segmentation.
        self.max_word_len = 4
        print('done')

    def score(self, word_candidate):
        """Score a candidate word as log2-probability of its best parse.

        :param word_candidate: sequence of characters forming the candidate
        :returns: logprob of the best parse, or -1000 when unparseable
            (so that 2*-1000 is effectively probability zero).
        """
        parse_trees = self.parser.nbest_parse(word_candidate)
        if parse_trees:  # any parse at all
            return parse_trees[0].logprob()
        return -1000

    def viterbi_segment(self, sentence):
        """Segment ``sentence`` (a list of characters) into words.

        Classic Viterbi segmentation: best_seg[i] holds the best
        segmentation of sentence[:i] together with its score.

        :param sentence: list of single characters
        :returns: list of word strings covering the whole sentence
        """
        # key: end index (python slice style) of the partial sentence;
        # value: (best_segmentation, segmentation_score).
        # The score 1 for the empty prefix is never read (only the
        # segmentation list is), so it is kept for fidelity.
        best_seg = {0: ([], 1)}
        for i in range(1, len(sentence) + 1):
            best_score = -1000000
            best_ptr = 0
            # Try word lengths 1..max_word_len ending at position i.
            for j in range(-1, -(min(i + 1, len(sentence) + 1, self.max_word_len + 1)), -1):
                word = sentence[i + j:i]
                word_score = self.score(word)
                if i + j > 0:
                    # Extend the best segmentation of the shorter prefix.
                    seg_score = best_seg[i + j][1] + word_score
                else:
                    # The word spans the whole prefix.
                    seg_score = word_score
                if seg_score > best_score:
                    best_ptr = i + j
                    best_score = seg_score
            prefix_seg = copy.copy(best_seg[best_ptr][0])
            prefix_seg.append(''.join(sentence[best_ptr:i]))
            best_seg[i] = (prefix_seg, best_score)
        return best_seg[len(sentence)][0]

    def segment_corpus(self, corpus):
        """Segment every sentence of ``corpus`` and write the result to disk.

        Even for a gold-standard corpus the raw character form is used
        (the original segmentation is discarded).

        :param corpus: sequence of sentence strings
        :returns: list of segmentations (one list of words per sentence)
        """
        results = []
        sent_count = 0
        # max(1, ...) guards against ZeroDivisionError for corpora < 100
        # sentences (original code crashed there).
        progress_step = max(1, len(corpus) // 100)
        for sent in corpus:
            if sent_count % progress_step == 0:
                print(math.ceil(sent_count / len(corpus) * 100), '% finished...')
            # Raw (non-space) characters of the sentence.
            char_list = re.findall(r'\S', sent, re.U)
            results.append(self.viterbi_segment(char_list))
            sent_count += 1
        path_out = '../working_data/base_seg.out'
        # `with` guarantees the file is flushed and closed (the original
        # never closed it).
        with codecs.open(path_out, 'w', 'utf-8') as f_out:
            print('\nPrinting out segmented corpus to file ', path_out)
            for r in results:
                f_out.write(' '.join(r) + '\n')
        print('\n', ' --- \nSegmentation Done! --- ')
        print('# of sentence being segmented:', sent_count)
        # Original printed undefined `lines` (NameError); the intended
        # comparison value is the corpus size.
        print('# of sentence in test corpus(see whether it matches last num):', len(corpus))
        return results