def XPathTF(etl, data):
    """Evaluate ``etl.XPath`` against ``data[etl.Column]`` and yield rows.

    This is a generator.

    * When ``etl.IsManyData`` is truthy, one row is yielded per XPath
      result. Each row is ``extends.MergeQuery(ext, data, etl.NewColumn)``
      where ``ext`` carries the keys ``'Text'``, ``'HTML'`` and ``'OHTML'``
      (``'OHTML'`` is the same string as ``'HTML'``).
    * Otherwise only the first XPath result is used: its ``.text`` (or
      ``str(result)`` when the result has no ``.text`` attribute, e.g. an
      attribute or ``text()`` match) is stored in ``data[etl.NewColumn]``
      and ``data`` itself is yielded.

    If the XPath matches nothing in single-result mode, ``data`` is yielded
    unchanged instead of raising ``IndexError``.

    Args:
        etl: Configuration object; the attributes read here are
            ``IsManyData``, ``Column``, ``XPath`` and ``NewColumn``.
        data: Mapping for the current row; mutated in single-result mode.

    Yields:
        The merged rows (many mode) or the updated ``data`` (single mode).
    """
    from lxml import etree

    # NOTE(review): presumably GetHtmlTree parses markup into an lxml
    # tree -- confirm against the spider module.
    tree = spider.GetHtmlTree(data[etl.Column])
    nodes = tree.xpath(etl.XPath)

    if etl.IsManyData:
        for node in nodes:
            text = spider.getnodetext(node)
            html = etree.tostring(node).decode('utf-8')
            ext = {'Text': text, 'HTML': html, 'OHTML': html}
            yield extends.MergeQuery(ext, data, etl.NewColumn)
        return

    try:
        first = nodes[0]
    except IndexError:
        # No match: pass the row through untouched rather than aborting
        # the whole generator.
        yield data
        return

    # Non-element XPath results have no ``.text``; fall back to their
    # string form, mirroring the sibling ``transform`` implementation.
    if hasattr(first, 'text'):
        data[etl.NewColumn] = first.text
    else:
        data[etl.NewColumn] = str(first)
    yield data
def transform(self, data):
    """Evaluate ``self.XPath`` against ``data[self.Column]`` and yield rows.

    This is a generator.

    * When ``self.IsManyData`` is truthy, one row is yielded per XPath
      result. Each row is ``extends.MergeQuery(ext, data, self.NewColumn)``
      where ``ext`` carries the keys ``'Text'``, ``'HTML'`` and ``'OHTML'``
      (``'OHTML'`` is the same string as ``'HTML'``).
    * Otherwise only the first XPath result is used: its ``.text`` (or
      ``str(result)`` when the result has no ``.text`` attribute) is passed
      to ``setValue(data, self, value)`` and ``data`` is yielded.

    If the XPath matches nothing in single-result mode, ``data`` is yielded
    unchanged and ``setValue`` is not called, instead of raising
    ``IndexError``.

    Args:
        data: Mapping for the current row.

    Yields:
        The merged rows (many mode) or ``data`` (single mode).
    """
    from lxml import etree

    # NOTE(review): presumably GetHtmlTree parses markup into an lxml
    # tree -- confirm against the spider module.
    tree = spider.GetHtmlTree(data[self.Column])
    nodes = tree.xpath(self.XPath)

    if self.IsManyData:
        for node in nodes:
            text = spider.getnodetext(node)
            html = etree.tostring(node).decode('utf-8')
            ext = {'Text': text, 'HTML': html, 'OHTML': html}
            yield extends.MergeQuery(ext, data, self.NewColumn)
        return

    try:
        first = nodes[0]
    except IndexError:
        # No match: pass the row through untouched rather than aborting
        # the whole generator.
        yield data
        return

    # Element results expose ``.text``; other XPath results (attribute or
    # text() matches) are stored via their string form.
    value = first.text if hasattr(first, 'text') else str(first)
    setValue(data, self, value)
    yield data