def convertStringtoPattern(xcur):
    """
    Convert a (possibly nested) list of values into a list of features.

    Nested lists in ``xcur`` are converted recursively and stay nested in
    the result. Every other element yields one ``featureObject``:

    * if ``float(elt)`` succeeds: a NUMERICAL feature named "x" with
      weight 1 and threshold ``self.THNUMERICAL``;
    * otherwise: an EDITDISTANCE feature named "f" with threshold 100.0.

    NOTE(review): ``featureObject`` and ``self`` are free names here;
    presumably this function is nested in a method that provides them.
    Confirm against the enclosing scope.

    :param xcur: iterable of values and/or nested lists
    :return: list mirroring the nesting of ``xcur``
    """
    ## need to integrate the 'virtual level'
    lRes = []
    for elt in xcur:
        if isinstance(elt, list):
            lRes.append(convertStringtoPattern(elt))
            continue

        # Only the numeric probe is guarded: an error raised while
        # configuring the feature must surface instead of silently
        # producing an EDITDISTANCE feature.
        try:
            float(elt)
        except (TypeError, ValueError, OverflowError):
            bNumerical = False
        else:
            bNumerical = True

        f = featureObject()
        if bNumerical:
            f.setName("x")
            f.setType(featureObject.NUMERICAL)
            f.setValue(elt)
            f.setObjectName(elt)
            f.setWeight(1)
            f.setTH(self.THNUMERICAL)
        else:
            f.setName("f")
            f.setType(featureObject.EDITDISTANCE)
            f.setValue(elt)
            f.setTH(100.0)
        lRes.append(f)
    return lRes
def getSetOfVInfoFeatures(self, TH, lAttributes, myObject):
    """
    Build one NUMERICAL feature per entry of ``lAttributes``.

    The first item of each entry supplies the feature name (``getName()``)
    and value (``getValue()``). When no feature ends up registered, a
    single BOOLEAN feature named 'EMPTY' (value True) is added.
    If features were already computed they are returned unchanged.

    :param TH: threshold handed to every feature through ``setTH``
    :param lAttributes: sequence of indexable entries
    :param myObject: not used by this method
    :return: ``self.getSetofFeatures()``
    """
    from spm.feature import featureObject

    if self._lBasicFeatures is None:
        self._lBasicFeatures = []  # needed to keep canonical values!
    elif self.getSetofFeatures() != []:
        return self.getSetofFeatures()

    for entry in lAttributes:
        info = entry[0]
        infoName = info.getName()
        infoValue = info.getValue()

        numFeature = featureObject()
        numFeature.setName(infoName)
        numFeature.setTH(TH)
        numFeature.addNode(self)
        numFeature.setObjectName(self)
        numFeature.setValue(infoValue)
        numFeature.setType(numFeature.NUMERICAL)
        self.addFeature(numFeature)

    # fallback marker when nothing was registered
    if self.getSetofFeatures() == []:
        emptyFeature = featureObject()
        emptyFeature.setName('EMPTY')
        emptyFeature.setTH(TH)
        emptyFeature.addNode(self)
        emptyFeature.setObjectName(self)
        emptyFeature.setValue(True)
        emptyFeature.setType(featureObject.BOOLEAN)
        self.addFeature(emptyFeature)

    return self.getSetofFeatures()
def getSetOfListedAttributes(self, TH, lAttributes, myObject):
    """
    move to XMLObjectClass ??

    Generate a set of features: one NUMERICAL feature per distinct
    rounded value of each listed attribute, collected over the objects
    returned by ``self.getAllNamedObjects(myObject)``. When 'virtual' is
    listed, a BOOLEAN feature 'f' carrying ``self.getAttribute('virtual')``
    is added as well. Already computed features are returned unchanged.

    :param TH: threshold handed to every feature through ``setTH``
    :param lAttributes: attribute names to look up on each object
    :param myObject: forwarded to ``getAllNamedObjects``
    :return: ``self.getSetofFeatures()``
    """
    from spm.feature import featureObject

    if self._lBasicFeatures is None:
        self._lBasicFeatures = []  # needed to keep canonical values!
    elif self.getSetofFeatures() != []:
        return self.getSetofFeatures()

    # attribute name -> {rounded value -> [objects having that value]}
    histogram = {}
    for child in self.getAllNamedObjects(myObject):
        for attrName in lAttributes:
            buckets = histogram.setdefault(attrName, {})
            if child.hasAttribute(attrName):
                rounded = round(float(child.getAttribute(attrName)))
                buckets.setdefault(rounded, []).append(child)

    if histogram:
        for attrName in lAttributes:
            for rounded, members in histogram[attrName].items():
                if len(members) > 0.1:
                    numFeature = featureObject()
                    numFeature.setName(attrName)
                    numFeature.setTH(TH)
                    numFeature.addNode(self)
                    numFeature.setObjectName(self)
                    numFeature.setValue(rounded)
                    numFeature.setType(featureObject.NUMERICAL)
                    self.addFeature(numFeature)

    if 'virtual' in lAttributes:
        flag = featureObject()
        flag.setName('f')
        flag.setTH(TH)
        flag.addNode(self)
        flag.setObjectName(self)
        flag.setValue(self.getAttribute('virtual'))
        flag.setType(featureObject.BOOLEAN)
        self.addFeature(flag)

    return self.getSetofFeatures()
def getSetOfFeaturesBB(self, TH, lAttributes, myObject):
    """
    Features from the bounding box (x, y, h, w): 'lm' = round(x) and
    'rm' = round(x + w), both NUMERICAL.

    The bounding box is created with ``addBoundingBox()`` when ``getBB()``
    returns None. If no feature ends up registered, a BOOLEAN 'EMPTY'
    feature is added. Already computed features are returned unchanged.

    :param TH: threshold handed to every feature through ``setTH``
    :param lAttributes: not used by this method
    :param myObject: not used by this method
    :return: ``self.getSetofFeatures()``
    """
    from spm.feature import featureObject

    if self._lBasicFeatures is None:
        self._lBasicFeatures = []  # needed to keep canonical values!
    elif self.getSetofFeatures() != []:
        return self.getSetofFeatures()

    def newFeature(featName):
        # shared preamble: name, threshold, node and object name
        feat = featureObject()
        feat.setName(featName)
        feat.setTH(TH)
        feat.addNode(self)
        feat.setObjectName(self.getName())
        return feat

    # build BB
    if self.getBB() is None:
        self.addBoundingBox()
    x, _y, _h, w = self.getBB()

    leftMargin = newFeature('lm')
    leftMargin.setValue(round(x))
    leftMargin.setType(leftMargin.NUMERICAL)
    self.addFeature(leftMargin)

    rightMargin = newFeature('rm')
    rightMargin.setValue(round(x + w))
    rightMargin.setType(rightMargin.NUMERICAL)
    self.addFeature(rightMargin)

    if self.getSetofFeatures() == []:
        emptyFeature = newFeature('EMPTY')
        emptyFeature.setValue(True)
        emptyFeature.setType(featureObject.BOOLEAN)
        self.addFeature(emptyFeature)

    return self.getSetofFeatures()
def getContentAnaphoraAttributes(self, TH, lAttributes, myObject):
    """
    textual content + position wrt parent

    For every object returned by ``self.getAllNamedObjects(myObject)``:

    * its index among its parent's objects is recorded under 'position'
      (the last child is recorded as -1);
    * for each listed attribute the object has, the rounded float value
      is recorded; values that ``float()`` rejects with TypeError are
      skipped;
    * for the attribute 'text', when the object has no such attribute,
      ``getContent()`` is recorded instead.

    One feature is then created per recorded value of each *listed*
    attribute: EDITDISTANCE for 'position' and 'text', NUMERICAL
    otherwise. Already computed features are returned unchanged.

    :param TH: threshold handed to every feature through ``setTH``
    :param lAttributes: attribute names to turn into features
    :param myObject: forwarded to ``getAllNamedObjects``
    :return: ``self.getSetofFeatures()``
    """
    from spm.feature import featureObject

    if self._lBasicFeatures is None:
        self._lBasicFeatures = []  # needed to keep canonical values!
    elif self.getSetofFeatures() != []:
        return self.getSetofFeatures()

    lHisto = {'position': {}}
    for elt in self.getAllNamedObjects(myObject):
        lSiblings = elt.getParent().getObjects()
        position = lSiblings.index(elt)
        # the last child of a parent is flagged with -1
        if position == len(lSiblings) - 1:
            position = -1
        lHisto['position'].setdefault(str(position), []).append(elt)

        for attr in lAttributes:
            histo = lHisto.setdefault(attr, {})
            if elt.hasAttribute(attr):
                try:
                    key = round(float(elt.getAttribute(attr)))
                except TypeError:
                    # value of a type float() cannot convert: ignore it
                    continue
                histo.setdefault(key, []).append(elt)
            elif attr == 'text':
                histo.setdefault(elt.getContent(), []).append(elt)

    for attr in lAttributes:
        # .get(): without any named object no histogram exists for attr,
        # and a plain lookup would raise KeyError
        for value, lElts in lHisto.get(attr, {}).items():
            if len(lElts) > 0.1:  # 0.1: keep all!
                if attr not in ['position', 'text']:
                    ftype = featureObject.NUMERICAL
                else:
                    ftype = featureObject.EDITDISTANCE
                feature = featureObject()
                feature.setName(attr)
                feature.setTH(TH)
                feature.addNode(self)
                feature.setObjectName(self)
                feature.setValue(value)
                feature.setType(ftype)
                self.addFeature(feature)

    return self.getSetofFeatures()
def getSetOfFeaturesPageSize(self, TH, lAttributes, myObject):
    """
    Features from the 'height' and 'width' attributes: 'h' and 'w', both
    NUMERICAL, each holding the rounded float value of its attribute.

    If no feature ends up registered, a BOOLEAN 'EMPTY' feature is added.
    Already computed features are returned unchanged.

    :param TH: threshold handed to every feature through ``setTH``
    :param lAttributes: not used by this method
    :param myObject: not used by this method
    :return: ``self.getSetofFeatures()``
    """
    from spm.feature import featureObject

    if self._lBasicFeatures is None:
        self._lBasicFeatures = []  # needed to keep canonical values!
    elif self.getSetofFeatures() != []:
        return self.getSetofFeatures()

    # (feature name, attribute read from self)
    for featName, attrName in (('h', 'height'), ('w', 'width')):
        sizeFeature = featureObject()
        sizeFeature.setName(featName)
        sizeFeature.setTH(TH)
        sizeFeature.addNode(self)
        sizeFeature.setObjectName(self)
        sizeFeature.setValue(round(float(self.getAttribute(attrName))))
        sizeFeature.setType(sizeFeature.NUMERICAL)
        self.addFeature(sizeFeature)

    if self.getSetofFeatures() == []:
        emptyFeature = featureObject()
        emptyFeature.setName('EMPTY')
        emptyFeature.setTH(TH)
        emptyFeature.addNode(self)
        emptyFeature.setObjectName(self)
        emptyFeature.setValue(True)
        emptyFeature.setType(featureObject.BOOLEAN)
        self.addFeature(emptyFeature)

    return self.getSetofFeatures()
def getSetOfFeaturesXPos(self, TH, lAttr, myObject):
    """
    Add three NUMERICAL features describing the horizontal position:
    'x' = round(getX()), 'x2' = round(getX() + getWidth()) and
    'xc' = round(getX() + getWidth() / 2).

    There is no early return on an already populated feature set: every
    call registers the three features again through ``addFeature``.

    :param TH: threshold handed to every feature through ``setTH``
    :param lAttr: not used by this method
    :param myObject: not used by this method
    :return: ``self.getSetofFeatures()``
    """
    from spm.feature import featureObject

    if self._lBasicFeatures is None:
        self._lBasicFeatures = []

    # (feature name, deferred computation of the raw coordinate)
    lPositions = (
        ('x', lambda: self.getX()),
        ('x2', lambda: self.getX() + self.getWidth()),
        ('xc', lambda: self.getX() + self.getWidth() / 2),
    )
    for featName, compute in lPositions:
        posFeature = featureObject()
        posFeature.setName(featName)
        posFeature.setTH(TH)
        posFeature.addNode(self)
        posFeature.setObjectName(self)
        posFeature.setValue(round(compute()))
        posFeature.setType(featureObject.NUMERICAL)
        self.addFeature(posFeature)

    return self.getSetofFeatures()
def getSetOfListedAttributes(self, TH, lAttributes, myObject):
    """
    Generate a set of features: X start of the lines

    Features produced, depending on what ``lAttributes`` lists:

    * any attribute found on the objects returned by
      ``self.getAllNamedObjects(myObject)``: one NUMERICAL feature per
      distinct rounded value (values ``float()`` rejects with TypeError
      are skipped);
    * 'text': EDITDISTANCE feature 'f' (threshold 90) holding the first
      whitespace-separated word of ``getContent()``;
    * 'tokens': one EDITDISTANCE feature 'token' per word longer than
      4 characters, lower-cased;
    * 'xc': NUMERICAL feature, round(getX() + getWidth() / 2);
    * 'virtual': BOOLEAN feature 'f' holding ``getAttribute('virtual')``;
    * 'bl': for each element of ``self.next``, a NUMERICAL feature with
      the rounded absolute distance between the vertical midpoints of the
      two baselines (only when both baselines are truthy);
    * 'linegrid': one BOOLEAN feature 'linegrid<rowh>' per entry of
      ``self.lgridlist``, holding that entry's ystart.

    Already computed features are returned unchanged.

    :param TH: threshold handed to features through ``setTH``
    :param lAttributes: attribute / feature-family names
    :param myObject: forwarded to ``getAllNamedObjects``
    :return: ``self.getSetofFeatures()``
    """
    from spm.feature import featureObject

    if self._lBasicFeatures is None:
        self._lBasicFeatures = []  # needed to keep canonical values!
    elif self.getSetofFeatures() != []:
        return self.getSetofFeatures()

    lHisto = {}
    for elt in self.getAllNamedObjects(myObject):
        for attr in lAttributes:
            histo = lHisto.setdefault(attr, {})
            if elt.hasAttribute(attr):
                try:
                    key = round(float(elt.getAttribute(attr)))
                except TypeError:
                    # value of a type float() cannot convert: ignore it
                    continue
                histo.setdefault(key, []).append(elt)

    for attr in lAttributes:
        # .get(): without any named object no histogram exists for attr,
        # and a plain lookup would raise KeyError
        for value, lElts in lHisto.get(attr, {}).items():
            if len(lElts) > 0.1:
                feature = featureObject()
                feature.setName(attr)
                feature.setTH(TH)
                feature.addNode(self)
                feature.setObjectName(self)
                feature.setValue(value)
                feature.setType(featureObject.NUMERICAL)
                self.addFeature(feature)

    if 'text' in lAttributes:
        content = self.getContent()
        if len(content):
            lWords = content.split()
            # whitespace-only content has no first word
            if lWords:
                feature = featureObject()
                feature.setName('f')
                feature.setTH(90)
                feature.addNode(self)
                feature.setObjectName(self)
                feature.setValue(lWords[0])
                feature.setType(featureObject.EDITDISTANCE)
                self.addFeature(feature)

    if 'tokens' in lAttributes:
        if len(self.getContent()):
            for token in self.getContent().split():
                if len(token) > 4:
                    feature = featureObject()
                    feature.setName('token')
                    feature.setTH(TH)
                    feature.addNode(self)
                    feature.setObjectName(self)
                    feature.setValue(token.lower())
                    feature.setType(featureObject.EDITDISTANCE)
                    self.addFeature(feature)

    if 'xc' in lAttributes:
        feature = featureObject()
        feature.setName('xc')
        feature.setTH(TH)
        feature.addNode(self)
        feature.setObjectName(self)
        feature.setValue(round(self.getX() + self.getWidth() / 2))
        feature.setType(featureObject.NUMERICAL)
        self.addFeature(feature)

    if 'virtual' in lAttributes:
        feature = featureObject()
        feature.setName('f')
        feature.setTH(TH)
        feature.addNode(self)
        feature.setObjectName(self)
        feature.setValue(self.getAttribute('virtual'))
        feature.setType(featureObject.BOOLEAN)
        self.addFeature(feature)

    if 'bl' in lAttributes:
        for inext in self.next:
            baseline = self.getBaseline()
            nbl = inext.getBaseline()
            if baseline and nbl:
                feature = featureObject()
                feature.setName('bl')
                feature.setTH(TH)
                feature.addNode(self)
                feature.setObjectName(self)
                # avg of baseline?
                avg1 = baseline.getY() + (baseline.getY2() - baseline.getY()) / 2
                avg2 = nbl.getY() + (nbl.getY2() - nbl.getY()) / 2
                feature.setValue(round(abs(avg2 - avg1)))
                feature.setType(featureObject.NUMERICAL)
                self.addFeature(feature)

    if 'linegrid' in lAttributes:
        # lgridlist.append((ystart,rowH, y1,yoverlap))
        for ystart, rowh, _, _ in self.lgridlist:
            feature = featureObject()
            feature.setName('linegrid%s' % rowh)
            feature.setTH(TH)
            feature.addNode(self)
            feature.setObjectName(self)
            feature.setValue(ystart)
            feature.setType(featureObject.BOOLEAN)
            self.addFeature(feature)

    return self.getSetofFeatures()
def getSetOfListedAttributes(self, TH, lAttributes, myObject):
    """
    Generate a set of features: X start of the lines

    When 'virtual' is listed, only a BOOLEAN feature 'f' holding
    ``getAttribute('virtual')`` is added and the method returns.

    Otherwise, over the objects returned by
    ``self.getAllNamedObjects(myObject)`` whose 'width' attribute is
    greater than 0, one NUMERICAL feature is created per distinct rounded
    value of each listed attribute; its weight is the sum of
    ``getHeight()`` of the objects sharing that value. If no feature ends
    up registered, a BOOLEAN 'EMPTY' feature is added.

    Already computed features are returned unchanged.

    :param TH: threshold handed to every feature through ``setTH``
    :param lAttributes: attribute names to look up on each object
    :param myObject: forwarded to ``getAllNamedObjects``
    :return: ``self.getSetofFeatures()``
    """
    from spm.feature import featureObject

    if self._lBasicFeatures is None:
        self._lBasicFeatures = []  # needed to keep canonical values!
    elif self.getSetofFeatures() != []:
        return self.getSetofFeatures()

    if 'virtual' in lAttributes:
        feature = featureObject()
        feature.setName('f')
        feature.setTH(TH)
        feature.addNode(self)
        feature.setObjectName(self)
        feature.setValue(self.getAttribute('virtual'))
        feature.setType(featureObject.BOOLEAN)
        self.addFeature(feature)
        return self.getSetofFeatures()

    lHisto = {}
    for elt in self.getAllNamedObjects(myObject):
        if float(elt.getAttribute('width')) > 0:
            for attr in lAttributes:
                histo = lHisto.setdefault(attr, {})
                if elt.hasAttribute(attr):
                    key = round(float(elt.getAttribute(attr)))
                    histo.setdefault(key, []).append(elt)

    for attr in lAttributes:
        # .get(): when no object passed the width filter there is no
        # histogram for attr; a plain lookup would raise KeyError and the
        # EMPTY fallback below would never be reached
        for value, lElts in lHisto.get(attr, {}).items():
            if len(lElts) > 0.1:
                feature = featureObject()
                feature.setName(attr)
                totalHeight = sum(x.getHeight() for x in lElts)
                feature.setWeight(totalHeight)
                feature.setTH(TH)
                feature.addNode(self)
                feature.setObjectName(self)
                feature.setValue(value)
                feature.setType(featureObject.NUMERICAL)
                self.addFeature(feature)

    if self.getSetofFeatures() == []:
        feature = featureObject()
        feature.setName('EMPTY')
        feature.setTH(TH)
        feature.addNode(self)
        feature.setObjectName(self)
        feature.setValue(True)
        feature.setType(featureObject.BOOLEAN)
        self.addFeature(feature)

    return self.getSetofFeatures()
def getSetOfListedAttributes(self, TH, lAttributes, myObject):
    """
    move to XMLObjectClass ??

    Generate a set of features: one NUMERICAL feature per distinct
    rounded value of each listed attribute, collected over the objects
    returned by ``self.getAllNamedObjects(myObject)``; every object
    sharing the value is attached to the feature with ``addNode``.
    Values ``float()`` rejects with TypeError are skipped.

    When nothing was collected the method returns right away. Otherwise
    'tokens' adds one EDITDISTANCE feature 'token' per lower-cased word
    of ``getContent()`` longer than 4 characters, and 'virtual' adds a
    BOOLEAN feature 'f' holding ``getAttribute('virtual')``.

    Already computed features are returned unchanged.

    :param TH: threshold handed to every feature through ``setTH``
    :param lAttributes: attribute / feature-family names
    :param myObject: forwarded to ``getAllNamedObjects``
    :return: ``self.getSetofFeatures()``
    """
    from spm.feature import featureObject

    if self._lBasicFeatures is None:
        self._lBasicFeatures = []  # needed to keep canonical values!
    elif self.getSetofFeatures() != []:
        return self.getSetofFeatures()

    # attribute name -> {rounded value -> [objects having that value]}
    histogram = {}
    for child in self.getAllNamedObjects(myObject):
        for attrName in lAttributes:
            buckets = histogram.setdefault(attrName, {})
            if not child.hasAttribute(attrName):
                continue
            try:
                rounded = round(float(child.getAttribute(attrName)))
            except TypeError:
                pass
            else:
                buckets.setdefault(rounded, []).append(child)

    # empty object
    if not histogram:
        return self.getSetofFeatures()

    for attrName in lAttributes:
        for rounded, members in histogram[attrName].items():
            if len(members) > 0.1:
                numFeature = featureObject()
                numFeature.setName(attrName)
                numFeature.setTH(TH)
                for member in members:
                    numFeature.addNode(member)
                numFeature.setObjectName(self)
                numFeature.setValue(rounded)
                numFeature.setType(featureObject.NUMERICAL)
                self.addFeature(numFeature)

    if 'tokens' in lAttributes and len(self.getContent()):
        for word in self.getContent().split():
            if len(word) <= 4:
                continue
            tokenFeature = featureObject()
            tokenFeature.setName('token')
            tokenFeature.setTH(TH)
            tokenFeature.addNode(self)
            tokenFeature.setObjectName(self)
            tokenFeature.setValue(word.lower())
            tokenFeature.setType(featureObject.EDITDISTANCE)
            self.addFeature(tokenFeature)

    if 'virtual' in lAttributes:
        flag = featureObject()
        flag.setName('f')
        flag.setTH(TH)
        flag.addNode(self)
        flag.setObjectName(self)
        flag.setValue(self.getAttribute('virtual'))
        flag.setType(featureObject.BOOLEAN)
        self.addFeature(flag)

    return self.getSetofFeatures()