def test_single2(self):
    """Smoke-test SennaParser + SentenceFeatures on a fixed sample corpus.

    Parses each non-empty line of a local text file with SENNA, extracts
    5-gram and chunk features, logs the intermediate feature structures,
    and vectorizes them with a DictVectorizer.

    NOTE(review): ends with a bare ``raise Exception`` so the test runner
    always fails and shows the captured debug output — presumably a
    debugging aid, not a real assertion. Confirm before keeping in CI.
    """
    # BUG(fixed): the original read os.environ["SENNAPATH"] into sp and then
    # immediately overwrote it with this hard-coded path, so the env lookup
    # could only contribute a spurious KeyError. The dead read is removed;
    # effective behavior (hard-coded path) is unchanged.
    sp = u"/Users/tuxedocat/Research/tools/senna/"
    parser = SennaParser(sp)
    # Context manager closes the file deterministically (the original
    # leaked the handle until GC).
    with open("/Users/tuxedocat/Documents/workspace/Nyanco/sandbox/recognize100.txt") as f:
        txt = [s for s in f.read().split("\n") if s]
    testdata = [parser.parseSentence(s) for s in txt]
    for t in testdata:
        fe = SentenceFeatures(t, "recognize")
        fe.ngrams(n=5)
        fe.chunk()
        logging.debug(pformat((fe.SUF, fe.CHK, fe.NER)))
        logging.debug(" ".join(fe.SUF))
        logging.debug(fe.v_idx)
        logging.debug(pformat(fe.features))
        vec = DictVectorizer(sparse=True)
        array_f = vec.fit_transform(fe.features).toarray()
    raise Exception
def test_with_offset(self):
    """Smoke-test SentenceFeatures on blank-line-separated offset test data.

    Splits ``self.testpath_off`` into per-document line lists, extracts
    length and bag-of-words features per document, logs intermediates,
    and vectorizes the collected feature dicts.

    NOTE(review): ends with a bare ``raise Exception`` so the runner
    always shows the debug output — a debugging leftover, not a real
    assertion. Also note an identical definition of this method appears
    later in the file and will shadow this one at class-creation time.
    """
    # BUG(fixed): the original opened the file without closing it; a
    # context manager releases the handle deterministically.
    with open(self.testpath_off) as f:
        self.testdata = [doc.split("\n") for doc in f.read().split("\n\n") if doc]
    fv = []
    for t in self.testdata:
        fe = SentenceFeatures(t)
        fe.length()
        fe.bow()
        logging.debug(pformat(zip(fe.SUF, fe.POS)))
        logging.debug(pformat(fe.OFFSET))
        logging.debug(pformat(fe.features))
        fv.append(fe.features)
    vec = DictVectorizer(sparse=True)
    array_f = vec.fit_transform(fv).toarray()
    logging.debug(pformat(array_f))
    raise Exception
def test_with_offset(self):
    """Smoke-test SentenceFeatures on blank-line-separated offset test data.

    Splits ``self.testpath_off`` into per-document line lists, extracts
    length and bag-of-words features per document, logs intermediates,
    and vectorizes the collected feature dicts.

    NOTE(review): this is a byte-identical duplicate of an earlier
    ``test_with_offset`` in the same file — this later definition shadows
    the earlier one; one of the two should be removed or renamed. It also
    ends with a bare ``raise Exception`` (debugging leftover).
    """
    # BUG(fixed): the original opened the file without closing it; a
    # context manager releases the handle deterministically.
    with open(self.testpath_off) as f:
        self.testdata = [doc.split("\n") for doc in f.read().split("\n\n") if doc]
    fv = []
    for t in self.testdata:
        fe = SentenceFeatures(t)
        fe.length()
        fe.bow()
        logging.debug(pformat(zip(fe.SUF, fe.POS)))
        logging.debug(pformat(fe.OFFSET))
        logging.debug(pformat(fe.features))
        fv.append(fe.features)
    vec = DictVectorizer(sparse=True)
    array_f = vec.fit_transform(fv).toarray()
    logging.debug(pformat(array_f))
    raise Exception
def _get_features_tgt(self, v_corpus=None, cls2id=None, domain="tgt"):
    """Extract easy-adapt-augmented feature dicts and labels from a corpus.

    For each sentence dict in ``v_corpus`` (expects keys ``label_corr``,
    ``parsed_corr``, ``vidx_corr`` — confirm against caller), builds a
    SentenceFeatures object, applies every extractor enabled in
    ``self.featuretypes``, and domain-augments the result.

    :param v_corpus: iterable of per-sentence dicts
    :param cls2id: mapping from verb label to integer class id
    :param domain: easy-adapt domain tag passed to proc_easyadapt
    :returns: tuple (feature_list, string_labels, int_labels)
    """
    _flist = []
    _labellist_int = []
    _labellist_str = []
    for sid, sdic in enumerate(v_corpus):
        v = sdic["label_corr"]
        _labelid = cls2id[v]
        try:
            fe = SentenceFeatures(sdic["parsed_corr"], verb=v, v_idx=sdic["vidx_corr"])
            if "chunk" in self.featuretypes:
                fe.chunk()
            if "3gram" in self.featuretypes:
                fe.ngrams(n=3)
            if "5gram" in self.featuretypes:
                fe.ngrams(n=5)
            if "7gram" in self.featuretypes:
                fe.ngrams(n=7)
            if "dep" in self.featuretypes:
                fe.dependency()
            if "srl" in self.featuretypes:
                fe.srl()
            if "ne" in self.featuretypes:
                fe.ne()
            if "errorprob" in self.featuretypes:
                pass  # not implemented for the target domain
            if "topic" in self.featuretypes:
                pass  # not implemented for the target domain
            augf = proc_easyadapt(fe.features, domain=domain)
            # BUG(fixed): the original asserted "augf and _labelid and v",
            # which wrongly rejected the legitimate class id 0 (falsy int).
            # Test for None explicitly instead of truthiness.
            assert augf and v and _labelid is not None
            _flist.append(augf)
            _labellist_int.append(_labelid)
            _labellist_str.append(v)
        except ValueError:
            logging.debug(pformat("CaseMaker feature extraction: couldn't find the verb"))
        except Exception:
            # BUG(fixed): was a bare "except: print v", which swallowed
            # everything (including KeyboardInterrupt/SystemExit) and wrote
            # to stdout. Narrow to Exception and log the traceback with the
            # offending verb instead.
            logging.exception("CaseMaker feature extraction failed for verb %s", v)
    return _flist, _labellist_str, _labellist_int
def get_features(tags=None, v="", v_idx=None, features=None):
    """Build an easy-adapt-augmented feature dict for one sentence.

    Runs each extractor whose key appears in ``features`` on a
    SentenceFeatures object, then domain-augments the result for the
    "tgt" domain.

    :param tags: parsed sentence tags passed to SentenceFeatures
    :param v: the verb of interest
    :param v_idx: index of the verb within the sentence
    :param features: collection of feature-type keys to enable
    :returns: result of proc_easyadapt over the extracted features
    """
    # BUG(fixed): the original used mutable default arguments
    # (tags=[], features=[]) — a shared-instance pitfall. None sentinels
    # preserve the empty-collection default behavior safely.
    if tags is None:
        tags = []
    if features is None:
        features = ()
    fe = SentenceFeatures(tags=tags, verb=v, v_idx=v_idx)
    # Dispatch table replaces the long if-chain; extraction order is
    # preserved from the original.
    extractors = (
        ("chunk", fe.chunk),
        ("3gram", lambda: fe.ngrams(n=3)),
        ("5gram", lambda: fe.ngrams(n=5)),
        ("7gram", lambda: fe.ngrams(n=7)),
        ("dependency", fe.dependency),
        ("ne", fe.ne),
        ("srl", fe.srl),
        ("topic", fe.topic),
        ("errorprob", fe.ep),
    )
    for key, extract in extractors:
        if key in features:
            extract()
    return proc_easyadapt(fe.features, domain="tgt")