def parsefile(fn):
    """Collect share rows from every table of the document *fn*.

    The fund name found next to a table and the most recent category
    heading are carried forward and merged into every data row.

    Args:
        fn: Passed unchanged to ``TreeScraper`` and ``datex``; also stored
            under the ``"file"`` key of every row.

    Returns:
        A list with one mapping per table row for which ``share``,
        ``price`` and ``number`` were all extracted.
    """
    scraper = TreeScraper(fn)
    context = {"file": fn, "date": datex(fn)}
    rows = []
    for table in scraper.select("//table"):
        fund_hit = table.extract(fund="../preceding-sibling::p[3]//i//text()")
        context.update(fund_hit)
        if "fund" in context:
            context["fund"] = strippct("".join(context["fund"]))

        for row in table.select("tr"):
            heading = row.extract(category="td//b//text()")
            if "category" in heading:
                # Bold text in a cell marks a category heading, not a holding.
                heading["category"] = strippct(sharename(heading["category"]))
                context.update(heading)
                continue

            record = row.extract(
                share="td[1]//text()[2]",
                price="td[8]//text()",
                number="td[4]//text()",
            )
            record.update(context)
            if not all(key in record for key in ("price", "number", "share")):
                continue

            # Only the first extracted text node of each field is used;
            # thousands separators are removed from the numeric fields.
            record["price"] = sharename(record["price"][0].replace(",", ""))
            record["number"] = sharename(record["number"][0].replace(",", ""))
            record["share"] = sharename(record["share"][0])
            rows.append(record)
    return rows
def parsefile(fn):
    """Collect share rows from every table of the document *fn*.

    A document-wide fund name is looked up first (an element whose text
    contains ``'Name of Fund:'``); a per-table fund name, when found,
    replaces it.  The most recent category heading is carried forward
    and merged into every data row.

    Args:
        fn: Path handed to ``TreeScraper`` and ``datex``.  Only its final
            path component is stored under the ``"file"`` key.

    Returns:
        A list with one mapping per table row for which ``share``,
        ``price`` and ``number`` were extracted and could be normalised.
        Rows whose normalisation raises are logged at debug level and
        skipped.
    """
    t = TreeScraper(fn)
    res = []
    tm = {"file": os.path.split(fn)[1], "date": datex(fn)}

    nof = t.extract(fund="//*[contains(text(),'Name of Fund:')]/text()")
    if "fund" in nof:
        nof["fund"] = stripname(nof["fund"])
        tm.update(nof)

    for table in t.select("//table"):
        tm.update(table.extract(fund="../preceding-sibling::p[3]//i//text()"))
        if "fund" in tm:
            tm["fund"] = sharename(strippct("".join(tm["fund"])))

        for row in table.select("tr"):
            rcat = row.extract(category="td//b//text()")
            if "category" in rcat:
                # Bold text in a cell marks a category heading; remember it
                # for the data rows that follow.
                rcat["category"] = strippct(sharename(rcat["category"]))
                tm.update(rcat)
                continue

            r = row.extract(
                share="td[1]//text()[2]",
                price="td[8]//text()",
                number="td[4]//text()",
            )
            r.update(tm)
            if ("price" in r) and ("number" in r) and ("share" in r):
                try:
                    r["price"] = sharename(r["price"].replace(",", ""))
                    r["number"] = sharename(r["number"].replace(",", ""))
                    r["share"] = sharename(r["share"])
                    res.append(r)
                except Exception as e:
                    # Best effort: a malformed row must not abort the file.
                    logger.debug("%s -%s" % (e, pprint.pformat(r)))
    return res
def parsefile(fn):
    """Collect share rows from every table of the document *fn*.

    A document-wide fund name is looked up first (an element whose text
    contains ``'Name of Fund:'``); a per-table fund name, when found,
    replaces it.  The most recent category heading is carried forward
    and merged into every data row.

    Args:
        fn: Path handed to ``TreeScraper`` and ``datex``.  Only its final
            path component is stored under the ``"file"`` key.

    Returns:
        A list with one mapping per table row for which ``share``,
        ``price`` and ``number`` were extracted and could be normalised.
        Rows whose normalisation raises are logged at debug level and
        skipped.
    """
    t = TreeScraper(fn)
    res = []
    tm = {"file": os.path.split(fn)[1], "date": datex(fn)}

    nof = t.extract(fund="//*[contains(text(),'Name of Fund:')]/text()")
    if "fund" in nof:
        nof["fund"] = stripname(nof["fund"])
        tm.update(nof)

    for table in t.select("//table"):
        tm.update(table.extract(fund="../preceding-sibling::p[3]//i//text()"))
        if "fund" in tm:
            tm["fund"] = sharename(strippct("".join(tm["fund"])))

        for row in table.select("tr"):
            rcat = row.extract(category="td//b//text()")
            if "category" in rcat:
                # Bold text in a cell marks a category heading; remember it
                # for the data rows that follow.
                rcat["category"] = strippct(sharename(rcat["category"]))
                tm.update(rcat)
                continue

            r = row.extract(
                share="td[1]//text()[2]",
                price="td[8]//text()",
                number="td[4]//text()",
            )
            r.update(tm)
            if ("price" in r) and ("number" in r) and ("share" in r):
                try:
                    r["price"] = sharename(r["price"].replace(",", ""))
                    r["number"] = sharename(r["number"].replace(",", ""))
                    r["share"] = sharename(r["share"])
                    res.append(r)
                except Exception as e:
                    # Best effort: a malformed row must not abort the file.
                    logger.debug("%s -%s" % (e, pprint.pformat(r)))
    return res
def parsefile(fn):
    """Collect share rows from every table of the document *fn*.

    A document-wide fund name is looked up first (an element whose text
    contains ``'Name of Fund:'``).  For each table a list of candidate
    XPaths is then tried in order and the first hit replaces the fund
    name.  Category headings and the last "left" label are carried
    forward and merged into every data row.

    Args:
        fn: Path handed to ``TreeScraper`` and ``datex``.  Only its final
            path component is stored under the ``"file"`` key.

    Returns:
        A list with one mapping per table row that yielded a non-empty
        ``price`` and ``number`` together with a ``share``.  Rows whose
        normalisation raises are logged at debug level and skipped.
    """
    t = TreeScraper(fn)
    res = []
    tm = {"file": os.path.split(fn)[1], "date": datex(fn)}

    nof = t.extract(fund="//*[contains(text(),'Name of Fund:')]/text()")
    if "fund" in nof:
        nof["fund"] = stripname(nof["fund"])
        tm.update(nof)

    # Candidate locations of the per-table fund name, tried in this order.
    fund_paths = (
        "../preceding-sibling::p[3]//i//text()",
        "../preceding-sibling::table//tr[3]//td[2]//b//text()[1]",
        "./preceding-sibling::table[1]/tr[2]/td[3]//b/text()[1]",
        "./preceding-sibling::table[1]/tr[3]/td[3]//b/text()[1]",
        "./preceding-sibling::p[3]//i//text()",
    )

    for table in t.select("//table"):
        for tg in fund_paths:
            tff = table.extract(fund=tg)
            if "fund" in tff:
                logger.debug("%s fund: %s" % (tg, repr(tff["fund"])))
                tm["fund"] = sharename(strippct("".join(tff["fund"])))
                break
        if "fund" in tm:
            tm["fund"] = sharename(strippct("".join(tm["fund"])))

        for row in table.select("tr"):
            rcat = row.extract(category="td[1]//b//text()")
            if "category" in rcat:
                # Bold text in the first cell marks a category heading.
                rcat["category"] = strippct(sharename(rcat["category"]))
                tm.update(rcat)
                continue

            r = row.extract(
                share="td[1]//text()[2]",
                left="td[1]//p[contains(@style,'margin-left:1.00em;')]//text()",
                indented="td[1]//p[contains(@style,'margin-left:3.00em;')]//text()",
                price="td[8]//text()",
                price2="td[7]//text()",
                number="td[4]//text()",
            )
            if "left" in r:
                # Remember the 1em-indented label; 3em-indented rows below
                # are prefixed with it.
                tm["left"] = "".join(r["left"])
            r.update(tm)

            if ("price" in r) and ("number" in r) and ("share" in r):
                try:
                    # NOTE(review): a row without "price2" raises KeyError
                    # here and is therefore skipped via the handler below.
                    for lt in ("price", "number", "price2"):
                        if isinstance(r[lt], list):
                            r[lt] = "".join(r[lt])
                    r["price"] = sharename(r["price"].replace(",", ""))
                    r["price2"] = sharename(r["price2"].replace(",", ""))
                    if r["price"] == "":
                        # Fall back to the neighbouring column.
                        r["price"] = r["price2"]
                    r["number"] = sharename(r["number"].replace(",", ""))
                    if "indented" in r:
                        r["share"] = sharename(tm["left"] + " " + r["share"])
                    else:
                        r["share"] = sharename(r["share"])
                    if (r["price"] > "") and (r["number"] > ""):
                        res.append(r)
                except Exception as e:
                    # Best effort: a malformed row must not abort the file.
                    logger.debug("%s -%s" % (e, pprint.pformat(r)))
    return res
def parsefile(fn):
    """Return the share rows found in the tables of the document *fn*.

    While walking the tables, the fund name located next to a table and
    the latest category heading are remembered and copied into each data
    row.

    Args:
        fn: Given as-is to ``TreeScraper`` and ``datex`` and recorded under
            the ``"file"`` key of every row.

    Returns:
        A list of mappings, one per row that provided ``share``, ``price``
        and ``number``.
    """
    required = ("price", "number", "share")
    carried = {"file": fn, "date": datex(fn)}
    found = []
    document = TreeScraper(fn)

    for tbl in document.select("//table"):
        carried.update(
            tbl.extract(fund="../preceding-sibling::p[3]//i//text()")
        )
        if "fund" in carried:
            fund_text = "".join(carried["fund"])
            carried["fund"] = sharename(fund_text)

        for tr in tbl.select("tr"):
            cat = tr.extract(category="td//b//text()")
            if "category" in cat:
                # Bold text in a cell marks a category heading row.
                cat["category"] = sharename("".join(cat["category"]))
                carried.update(cat)
                continue

            entry = tr.extract(
                share="td[1]//text()[2]",
                price="td[8]//text()",
                number="td[4]//text()",
            )
            entry.update(carried)
            if not all(field in entry for field in required):
                continue

            # Use the first text node of each field and drop thousands
            # separators from the numeric ones.
            price_text = entry["price"][0].replace(",", "")
            entry["price"] = sharename(price_text)
            number_text = entry["number"][0].replace(",", "")
            entry["number"] = sharename(number_text)
            entry["share"] = sharename(entry["share"][0])
            found.append(entry)

    return found
def parsefile(fn):
    """Collect share rows from every table of the document *fn*.

    A document-wide fund name is looked up first (an element whose text
    contains ``'Name of Fund:'``).  For each table a list of candidate
    XPaths is tried in order and the first hit replaces the fund name.
    Within a table, rows without numeric cells are treated as headings
    and accumulated in ``tm["akku"]``; indented rows that do carry
    numbers get the accumulated headings prefixed to their share name.

    Args:
        fn: Path handed to ``TreeScraper`` and ``datex``.  Only its final
            path component is stored under the ``"file"`` key.

    Returns:
        A list with one mapping per table row that yielded a non-empty
        ``price`` and ``number`` together with a ``share``.  Rows whose
        normalisation raises are logged at debug level and skipped.
    """
    t = TreeScraper(fn)
    res = []
    tm = {"file": os.path.split(fn)[1], "date": datex(fn)}

    nof = t.extract(fund="//*[contains(text(),'Name of Fund:')]/text()")
    if "fund" in nof:
        nof["fund"] = stripname(nof["fund"])
        tm.update(nof)

    # Candidate locations of the per-table fund name, tried in this order.
    fund_paths = (
        "../preceding-sibling::p[3]//i//text()",
        "../preceding-sibling::table//tr[3]//td[2]//b//text()[1]",
        "./preceding-sibling::table[1]/tr[2]/td[3]//b/text()[1]",
        "./preceding-sibling::table[1]/tr[3]/td[3]//b/text()[1]",
        "../preceding-sibling::div[1]//table[1]/tr[2]/td[contains(@style,'text-align: right')]//text()[1]",
        "./preceding-sibling::p[3]//i//text()",
    )

    for table in t.select("//table"):
        for tg in fund_paths:
            tff = table.extract(fund=tg)
            if "fund" in tff:
                logger.debug("%s fund: %s" % (tg, repr(tff["fund"])))
                tm["fund"] = sharename(strippct("".join(tff["fund"])))
                break
        if "fund" in tm:
            tm["fund"] = sharename(strippct("".join(tm["fund"])))

        # Category and heading accumulator start fresh for every table.
        tm["category"] = ""
        tm["akku"] = []

        for row in table.select("tr"):
            rcat = row.extract(category=".//td[contains(@colspan,'9')]//text()")
            if "category" in rcat:
                logger.debug("Category: %s" % repr(rcat))
                rcat["category"] = strippct(sharename(rcat["category"]))
                tm.update(rcat)
                continue

            r = row.extract(
                share="td[1]//text()",
                left="td[1][starts-with(@style,'text-align: left')]/text()",
                indent="td[1][contains(@style,'text-indent:')]/text()",
                number="td[2]//text() | td[3]//text() | td[4]//text() | td[5]//text() | td[6]//text() | td[7]//text() | td[8]//text()",
            )

            # A first cell that is neither left-aligned nor indented is
            # taken as a category label.
            if ("share" in r) and ("left" not in r) and ("indent" not in r):
                tm["category"] = strippct(sharename(r["share"]))

            if "number" in r:
                # Keep only cell texts that start like a number, optionally
                # preceded by whitespace and an opening parenthesis.
                # NOTE(review): len() and the == [] test below rely on
                # Python 2 filter() returning a sequence.
                r["number"] = filter(
                    lambda a: re.match(r"\s*\(?\d", a), r["number"]
                )
                if len(r["number"]) == 2:
                    # Exactly two numeric cells: quantity, then price.
                    r["price"] = r["number"][1]
                    r["number"] = r["number"][0]

                if r["number"] == []:
                    # No numeric cells: the row is a heading.  An indented
                    # heading extends the stack, any other replaces it.
                    if "indent" in r:
                        tm["akku"].append("".join(r["share"]))
                    elif "share" in r:
                        tm["akku"] = ["".join(r["share"])]
                elif tm["akku"]:
                    if "indent" in r:
                        r["share"] = "%s %s" % (" ".join(tm["akku"]), r["share"])
                    # Trim the stack back to its top-level heading.
                    # NOTE(review): nesting reconstructed from a source
                    # whose indentation was lost; confirm that the trim is
                    # meant to apply to every row carrying numbers.
                    tm["akku"] = [tm["akku"][0]]

            logger.debug(repr(r))
            r.update(tm)
            # Snapshot the heading stack so later headings appended to
            # tm["akku"] cannot mutate rows that were already emitted.
            r["akku"] = list(tm["akku"])

            if ("price" in r) and ("number" in r) and ("share" in r):
                try:
                    for lt in ("price", "number"):
                        if isinstance(r[lt], list):
                            r[lt] = "".join(r[lt])
                    r["price"] = sharename(r["price"].replace(",", ""))
                    r["number"] = sharename(r["number"].replace(",", ""))
                    # NOTE(review): this function extracts "indent", not
                    # "indented", and never sets tm["left"], so this branch
                    # looks unreachable; kept unchanged pending confirmation.
                    if "indented" in r:
                        r["share"] = sharename(tm["left"] + " " + r["share"])
                    else:
                        r["share"] = sharename(r["share"])
                    if (r["price"] > "") and (r["number"] > ""):
                        res.append(r)
                except Exception as e:
                    # Best effort: a malformed row must not abort the file.
                    logger.debug("%s -%s" % (e, pprint.pformat(r)))
    return res
def parsefile(fn):
    """Collect share rows from every table of the document *fn*.

    A document-wide fund name is looked up first (an element whose text
    contains ``'Name of Fund:'``).  For each table a list of candidate
    XPaths is then tried in order and the first hit replaces the fund
    name.  Category headings and the last "left" label are carried
    forward and merged into every data row.

    Args:
        fn: Path handed to ``TreeScraper`` and ``datex``.  Only its final
            path component is stored under the ``"file"`` key.

    Returns:
        A list with one mapping per table row that yielded a non-empty
        ``price`` and ``number`` together with a ``share``.  Rows whose
        normalisation raises are logged at debug level and skipped.
    """
    t = TreeScraper(fn)
    res = []
    tm = {"file": os.path.split(fn)[1], "date": datex(fn)}

    nof = t.extract(fund="//*[contains(text(),'Name of Fund:')]/text()")
    if "fund" in nof:
        nof["fund"] = stripname(nof["fund"])
        tm.update(nof)

    # Candidate locations of the per-table fund name, tried in this order.
    fund_paths = (
        "../preceding-sibling::p[3]//i//text()",
        "../preceding-sibling::table//tr[3]//td[2]//b//text()[1]",
        "./preceding-sibling::table[1]/tr[2]/td[3]//b/text()[1]",
        "./preceding-sibling::table[1]/tr[3]/td[3]//b/text()[1]",
        "./preceding-sibling::p[3]//i//text()",
    )

    for table in t.select("//table"):
        for tg in fund_paths:
            tff = table.extract(fund=tg)
            if "fund" in tff:
                logger.debug("%s fund: %s" % (tg, repr(tff["fund"])))
                tm["fund"] = sharename(strippct("".join(tff["fund"])))
                break
        if "fund" in tm:
            tm["fund"] = sharename(strippct("".join(tm["fund"])))

        for row in table.select("tr"):
            rcat = row.extract(category="td[1]//b//text()")
            if "category" in rcat:
                # Bold text in the first cell marks a category heading.
                rcat["category"] = strippct(sharename(rcat["category"]))
                tm.update(rcat)
                continue

            r = row.extract(
                share="td[1]//text()[2]",
                left="td[1]//p[contains(@style,'margin-left:1.00em;')]//text()",
                indented="td[1]//p[contains(@style,'margin-left:3.00em;')]//text()",
                price="td[8]//text()",
                price2="td[7]//text()",
                number="td[4]//text()",
            )
            if "left" in r:
                # Remember the 1em-indented label; 3em-indented rows below
                # are prefixed with it.
                tm["left"] = "".join(r["left"])
            r.update(tm)

            if ("price" in r) and ("number" in r) and ("share" in r):
                try:
                    # NOTE(review): a row without "price2" raises KeyError
                    # here and is therefore skipped via the handler below.
                    for lt in ("price", "number", "price2"):
                        if isinstance(r[lt], list):
                            r[lt] = "".join(r[lt])
                    r["price"] = sharename(r["price"].replace(",", ""))
                    r["price2"] = sharename(r["price2"].replace(",", ""))
                    if r["price"] == "":
                        # Fall back to the neighbouring column.
                        r["price"] = r["price2"]
                    r["number"] = sharename(r["number"].replace(",", ""))
                    if "indented" in r:
                        r["share"] = sharename(tm["left"] + " " + r["share"])
                    else:
                        r["share"] = sharename(r["share"])
                    if (r["price"] > "") and (r["number"] > ""):
                        res.append(r)
                except Exception as e:
                    # Best effort: a malformed row must not abort the file.
                    logger.debug("%s -%s" % (e, pprint.pformat(r)))
    return res