def removeListsWithLinks(self, doc):
    """\
    Remove link-heavy lists and link-only containers from doc, then return doc.

    Two passes are made over the tree:

    1. Every 'ol'/'ul' element with more than two consecutive children that
       contain an 'a' tag is removed. Its parent is removed as well when the
       parent has no children left or its text has fewer than four
       whitespace-separated words.
    2. For every 'a' element, the pieces of the parent's text lying between
       its links are collected as "separators". The parent is removed when
       the separators look like menu delimiters: all empty, exactly '|' '|',
       more than three separators of at most one character, or more than two
       separators none of which is longer than three characters or contains
       a comma.

    The tree is modified in place; the same doc object is returned.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source -- confirm the placement of ``break`` and
    ``else`` in the first loop against the upstream file.
    """
    # ---- Pass 1: lists made of consecutive link items ----
    items=Parser.getElementsByTags(doc, ('ol','ul'))
    for item in items:
        # fa = length of the current run of consecutive children with a link
        fa = 0
        for li in item:
            if Parser.hasChildTag(li, 'a'):
                fa += 1
                if fa > 2:
                    # Grab the parent before removal so it can be inspected
                    # once the list is gone.
                    parent = item.getparent()
                    Parser.remove(item)
                    if parent is not None:
                        # Drop the wrapper too if nothing meaningful remains:
                        # no children, or fewer than four words of text.
                        if len(parent) == 0 or len(Parser.getText(parent).split()) < 4:
                            Parser.remove(parent)
                    # The list has been removed; stop scanning its children.
                    break
            else:
                # A child without a link interrupts the run.
                fa = 0

    # ---- Pass 2: containers whose content is links plus short separators ----
    items=Parser.getElementsByTag(doc, tag='a')
    for a in items:
        e = a.getparent()
        if e is None: continue
        # `text` is consumed left to right as each child's text is located.
        text = Parser.getText(e)
        # Text fragments found before/between the children's texts.
        ldels = []
        # NOTE(review): textcount is assigned but never read in this method.
        textcount = 0
        for link in e:
            ltext = Parser.getText(link)
            # Non-anchor children with at most two characters of text are ignored.
            if link.tag != 'a' and len(ltext) <= 2: continue
            # A non-anchor child with longer text means this is not a pure
            # link container: discard what was collected and stop, which
            # makes the emptiness check below skip this parent.
            if link.tag != 'a' and len(ltext) > 2:
                ldels = []
                break
            if ltext == '': continue
            # Split the remaining text at the first occurrence of this
            # child's text; the left part is the separator preceding it.
            ldel = text.split(ltext,1)
            ld = ldel[0].strip()
            ldels.append(ld)
            # Only one piece means the child's text was not found; stop.
            if len(ldel) == 1: break
            text = ldel[1]
        # Nothing collected, or the text before the first link is a lone
        # comma: leave this parent alone.
        if len(ldels) == 0 or ldels[0] == ',': continue
        # The first entry is the text before the first link, not a separator
        # between links, so it is discarded.
        else: del ldels[0]
        # flag  : number of separators equal to the first one
        # flag1 : 1 if any separator is longer than 3 chars or contains ','
        # flag2 : 1 if any separator is non-empty
        # flag3 : 1 if any separator is longer than 1 char
        flag = 0; flag1 = 0; flag2 = 0; flag3 = 0
        for ldel in ldels:
            if ldel == ldels[0]: flag += 1
            if len(ldel) > 3 or ',' in ldel: flag1 = 1
            if ldel != '': flag2 = 1
            if len(ldel) > 1: flag3 = 1
        # Two or more separators, all of them empty.
        if flag2 == 0 and len(ldels) > 1:
            Parser.remove(e)
            continue
        # Exactly two separators, both a pipe character.
        if len(ldels) == 2 and ldels[0] == '|' and ldels[1] == '|':
            Parser.remove(e)
            continue
        # More than three separators, none longer than a single character.
        if len(ldels) > 3 and flag3 == 0:
            Parser.remove(e)
            continue
        # Keep the parent when there are at most two separators, or when any
        # separator is long / contains a comma.
        if (flag <= 2 and len(ldels) <= 2) or flag1 != 0: continue
        # Otherwise: more than two short, comma-free separators.
        Parser.remove(e)
    return doc
def isTableTagAndNoParagraphsExist(self, e, enabled=False):
    """\
    Tell whether e is left without 'p' children after pruning short paragraphs.

    The check is disabled by default: the method previously returned False
    unconditionally before reaching its body, so callers that pass only ``e``
    still always get False and the tree is left untouched.

    When ``enabled`` is true:

    * every element returned by ``Parser.getElementsByTag(e, tag='p')`` whose
      text is shorter than 25 characters is removed from the tree (side
      effect on e);
    * True is returned if ``Parser.hasChildTag(e, 'p')`` is then false and
      e's tag is not 'td'; False otherwise.

    :param e: element to inspect (presumably an lxml element -- only ``.tag``
        is read directly here).
    :param enabled: opt in to the pruning heuristic; defaults to False to
        preserve the historical always-False behaviour.
    :return: bool
    """
    if not enabled:
        return False

    for paragraph in Parser.getElementsByTag(e, tag='p'):
        if len(Parser.getText(paragraph)) < 25:
            Parser.remove(paragraph)

    # Compare tag names by value: identity comparison against a string
    # literal depends on interning and is flagged by Python 3.8+.
    return not Parser.hasChildTag(e, 'p') and e.tag != "td"
def getSiblingContent(self, currentSibling, baselineScoreForSiblingParagraphs, good_path):
    """\
    Collect the parts of a sibling node that are worth adding to the content.

    If the sibling itself is a 'p', 'h2', 'h3' or 'h4' element with
    non-empty text, it is returned alone in a list. Otherwise each 'p'
    element found under it is kept when either

    * its path equals ``good_path`` and it has no 'a' child, or
    * it has text, is not flagged as link-heavy by ``getTextStats``, and its
      stop-word count exceeds 30% of ``baselineScoreForSiblingParagraphs``.

    Kept paragraphs are returned in reverse order of discovery. None is
    returned when the paragraph lookup itself yields None.
    """
    is_text_block = currentSibling.tag in ('p', 'h2', 'h3', 'h4')
    if is_text_block and len(Parser.getText(currentSibling)) > 0:
        return [currentSibling]

    candidates = Parser.getElementsByTag(currentSibling, tag='p')
    if candidates is None:
        return None

    kept = []
    for paragraph in candidates:
        paragraph_path = Parser.getPath(paragraph)
        text_len, stop_count, high_link_density = self.getTextStats(paragraph)

        if paragraph_path == good_path and not Parser.hasChildTag(paragraph, 'a'):
            # Same path as the known-good node and link-free: always keep.
            keep = True
        elif text_len > 0:
            threshold = float(baselineScoreForSiblingParagraphs * 0.30)
            keep = threshold < stop_count and not high_link_density
        else:
            keep = False

        if keep:
            kept.append(paragraph)

    # Later paragraphs come first in the result (equivalent to inserting
    # each one at the front).
    kept.reverse()
    return kept