コード例 #1
0
def find_fld(soup, r, tipo=None, txt=None):
    """Return the parents of the ``<legend>`` tags whose text matches ``r``.

    Every ``<legend>`` returned by ``soup.findAll`` whose ``get_text()``
    satisfies ``r.match`` is a candidate.  The candidate's parent is kept
    when its ``.div`` (with BeautifulSoup: the first ``<div>`` below it)
    passes the check selected by ``tipo``:

    * ``None`` -- no check; every candidate parent is returned.
    * ``1`` -- the div contains a ``<p>`` with text "Verdadero" and one with
      text "Falso", has more than two direct children, and its second
      direct child equals "Verdadero" once passed through ``util.sclean``.
    * ``4`` -- the div contains both the "Verdadero" and the "Falso"
      ``<p>``; no positional requirement.
    * ``2`` -- the div has more than one direct child and the first one is
      a ``<p>``.
    * ``3`` -- ``div.p`` exists and its stripped text equals ``txt``.
    * ``5`` -- the leading run of direct ``<p>`` children holds more than
      one paragraph starting with "- " and exactly one that does not.

    Any other ``tipo`` value selects nothing.

    Args:
        soup: object exposing ``findAll`` (a BeautifulSoup tree).
        r: compiled pattern (anything with a ``match`` method).
        tipo: structural check to apply, see above.
        txt: text compared against the first paragraph when ``tipo == 3``.

    Returns:
        list of the matching parent elements, in document order.
    """
    rt = []
    for l in soup.findAll('legend'):
        if not r.match(l.get_text()):
            continue
        f = l.parent
        if tipo is None:
            rt.append(f)
            continue

        div = f.div
        if div is None:
            # Every structural check below inspects the div; a parent
            # without one cannot match, so skip it instead of raising
            # AttributeError on None.
            continue

        if tipo == 1 or tipo == 4:
            if div.find("p", text="Verdadero") and div.find("p", text="Falso"):
                hjs = div.select(" > *")
                if tipo == 4 or (len(hjs) > 2
                                 and util.sclean(hjs[1]) == "Verdadero"):
                    rt.append(f)
        elif tipo == 2:
            hjs = div.select(" > *")
            if len(hjs) > 1 and hjs[0].name == "p":
                rt.append(f)
        elif tipo == 3:
            p = div.p
            if p and p.get_text().strip() == txt:
                rt.append(f)
        elif tipo == 5:
            cg = 0  # paragraphs starting with "- "
            sg = 0  # paragraphs not starting with "- "
            for h in div.select(" > *"):
                if h.name != "p":
                    # Only the leading run of paragraphs is considered.
                    break
                if h.get_text().strip().startswith("- "):
                    cg += 1
                else:
                    sg += 1
                    if sg > 1:
                        # A second plain paragraph already rules it out.
                        break
            if cg > 1 and sg == 1:
                rt.append(f)
    return rt
コード例 #2
0
ファイル: join.py プロジェクト: santos82/LFS201
def find_fld(soup,r,tipo=None,txt=None):
	"""Collect the parent of each ``<legend>`` whose text matches ``r``.

	A ``<legend>`` (from ``soup.findAll``) is a candidate when
	``r.match`` accepts its ``get_text()``.  Its parent is appended to the
	result when the parent's ``.div`` (with BeautifulSoup: the first
	``<div>`` below it) passes the check chosen by ``tipo``:

	* ``None``: no check, every candidate parent is returned.
	* ``1``: the div has a ``<p>`` with text "Verdadero" and one with text
	  "Falso", more than two direct children, and a second direct child
	  equal to "Verdadero" after ``util.sclean``.
	* ``4``: the div has both the "Verdadero" and the "Falso" ``<p>``.
	* ``2``: the div has more than one direct child, the first a ``<p>``.
	* ``3``: ``div.p`` exists and its stripped text equals ``txt``.
	* ``5``: the leading run of direct ``<p>`` children contains more than
	  one paragraph starting with "- " and exactly one that does not.

	Other ``tipo`` values match nothing.

	Args:
		soup: object exposing ``findAll`` (a BeautifulSoup tree).
		r: compiled pattern (anything with a ``match`` method).
		tipo: which structural check to apply, see above.
		txt: expected first-paragraph text, only used when ``tipo == 3``.

	Returns:
		list of matching parent elements, in document order.
	"""
	rt=[]
	for l in soup.findAll('legend'):
		if not r.match(l.get_text()):
			continue
		f=l.parent
		if tipo is None:
			rt.append(f)
			continue
		div=f.div
		if div is None:
			# All remaining checks look inside the div; without one the
			# parent cannot match, so skip it rather than fail on None.
			continue
		if tipo==1 or tipo==4:
			if div.find("p",text="Verdadero") and div.find("p",text="Falso"):
				hjs=div.select(" > *")
				if tipo==4 or (len(hjs)>2 and util.sclean(hjs[1])=="Verdadero"):
					rt.append(f)
		elif tipo==2:
			hjs=div.select(" > *")
			if len(hjs)>1 and hjs[0].name=="p":
				rt.append(f)
		elif tipo==3:
			first_p=div.p
			if first_p and first_p.get_text().strip()==txt:
				rt.append(f)
		elif tipo==5:
			cg=0	# paragraphs starting with "- "
			sg=0	# paragraphs not starting with "- "
			for h in div.select(" > *"):
				if h.name!="p":
					# only the leading run of paragraphs counts
					break
				if h.get_text().strip().startswith("- "):
					cg+=1
				else:
					sg+=1
					if sg>1:
						# a second plain paragraph already disqualifies it
						break
			if cg>1 and sg==1:
				rt.append(f)
	return rt
コード例 #3
0
	fs=find_fld(d, cono, 1)
	if len(fs)>1:
		v1=fs[0]
		del fs[0]
		for v in fs:
			dv=v.div
			v1.append(dv)
			dv.unwrap()
			v.extract()

	fs=find_fld(d, cono, 4)
	for v1 in fs:
		uls=[]
		ul=soup.new_tag("ul")
		for p in v1.findAll("p"):
			if p.next_sibling and p.next_sibling.name=="p" and util.sclean(p.next_sibling)=="Verdadero":
				p.name="li"
				c=p.contents[0]
				if isinstance(c, bs4.NavigableString) or isinstance(c, unicode):
					cs=c.strip()
					if len(cs)>1 and cs[0].isdigit() and cs[1]==".":
						c.extract()
						ul.name="ol"
						if cs[0]=="1" and len(ul.contents)>0:
							uls.append(ul)
							ul=soup.new_tag("ul")
				ul.append(p)
			elif util.sclean(p)=="Verdadero" or util.sclean(p)=="Falso":
				p.extract()
		p=soup.new_tag("p")
		p.string=preguntaVF
コード例 #4
0
ファイル: clean.py プロジェクト: stefanlasiewski/lfs-crawler
    ttxt = soup.select("div.ttxt")
    for t in ttxt:
        t.replaceWithChildren()

    comments = soup.findAll(text=lambda text: isinstance(text, bs4.Comment))
    for n in comments:
        n.extract()

    tags = util.vacio(soup, ['table', 'p', 'div', 'ul', 'ol', 'li'])
    for t in tags:
        t.extract()

    spans = soup.select("span")
    for s in spans:
        txt = util.sclean(s)
        if len(txt) == 0 or txt == ":":
            s.unwrap()
        elif 'style' not in s.attrs:
            s.unwrap()
        elif "rgb(0, 150, 200)" in s.attrs[
                'style'] and s.parent.name != "a" and not (
                    len(s.select(" > *")) == 1
                    and s.select(" > *")[0].name == "a"):
            s.attrs['class'] = "enlace"
        elif util.has(s.attrs['style'], [
                "rgb(0, 0, 255)", "rgb(0, 0, 205)", "color:#0000CD",
                "rgb(41, 1, 208)", "color:#0000FF"
        ]):
            s.attrs['class'] = "comando"
        elif util.has(s.attrs['style'],