def train(self):
    """Train the model using the known output for the given urls

    For every output position of the first document, each document's
    expected output at that position is matched against the document's
    xpaths (via ``doc.matchXpaths``) and the xpaths sharing the lowest
    accumulated score are collected. A position whose expected output is
    a list is treated as a group and stored as ``self.abstractXpaths(...)``;
    any other position is stored as a tuple of ``self.rankXpaths(...)``.
    Positions that yield no xpaths at all add nothing to the model.

    Returns the populated HtmlXpathSet. When ``self._attributes`` is
    truthy, ``self.addAttributes`` is called on the model before returning.
    """
    model = HtmlXpathSet()
    # the first document decides how many output positions are trained
    firstOutputs = self._docs[0].outputs()
    for i in range(len(firstOutputs)):
        isGroup = False
        xpaths = []
        if self._debug:
            print(firstOutputs[i])
        for docIndex, doc in enumerate(self._docs):
            docOutputs = doc.outputs()
            if i >= len(docOutputs):
                # this document has fewer outputs than the first one
                continue
            outputs = docOutputs[i]
            if isinstance(outputs, list):
                isGroup = True
            else:
                outputs = [outputs]
            for output in outputs:
                # rate xpaths by the similarity of their content with the output
                outputScores = defaultdict(int)
                for xpath, score in doc.matchXpaths(normalizeStr(output)):
                    outputScores[xpath] += score
                if not outputScores:
                    # min() of an empty sequence raises ValueError, so report
                    # the miss and move on to the next output instead
                    print("Warning: could not find '%s' (no candidate xpaths)" % (output,))
                    continue
                # select best xpath match for each output (lowest score wins)
                bestScore = min(outputScores.values())
                if bestScore > 0:
                    # report the best score itself rather than whatever value
                    # an earlier loop variable happened to be left holding
                    print("Warning: could not find '%s' (score=%d)" % (output, bestScore))
                best = [(xpath, score) for (xpath, score) in outputScores.items() if score == bestScore]
                xpaths.extend([xpath for (xpath, score) in best])
                if self._debug:
                    print(pretty([(docIndex, xpath, score) for (xpath, score) in best]))
        if xpaths:
            if isGroup:
                model.append(self.abstractXpaths(xpaths))
            else:
                model.append(tuple(self.rankXpaths(xpaths)))
            if self._debug:
                print('Best:\n%s\n' % model[-1])
    if self._attributes:
        self.addAttributes(model)
    return model
def getElementText(self, e):
    """Return the normalized, whitespace-trimmed text found under this HtmlElement
    """
    rawText = e.text_content()
    trimmedText = rawText.strip()
    return normalizeStr(trimmedText)