def from_html_productlist_common(self, webfragment, grupp, best = 0):
    """Fill in this product from one row of a product-list page.

    webfragment -- HTML fragment holding a single product row.
    grupp       -- group the row belongs to; stored unchanged.
    best        -- handed on to Container.from_html_productlist.

    The state becomes VALID when both a name and an article number
    were extracted, INVALID otherwise.  Returns self.
    """
    assert self.state == NEW

    row_matcher = MSet([
        ("varunr", MS(r'p_varunr=([0-9]+)')),
        ("namn", MS(r'(?s)>(.*?)</a>')),
        ("årgång", MSDeH(r'(?s)<td.*?>(.*?)</td>')),
        ("varunr2", MSDeH(r'(?s)<td.*?>(.*?)</td>')),
        ("land", MSDeH(r'(?s)<td.*?>(.*?)</td>')),
        ("förpdata", M()),
    ])
    fields = row_matcher.get(webfragment)

    self.grupp = grupp
    self.varunr = fields.get("varunr")
    self.namn = fields.get("namn")
    self.ursprung = fields.get("land")
    self.argang = fields.get("årgång", "")

    # Each row now describes exactly one container.  Products that come
    # in several containers show up as several rows, which the
    # ProductList class merges afterwards.
    self.forpackningar = []
    container_html = fields["förpdata"]
    container = Container().from_html_productlist(container_html, best)
    self.forpackningar.append(container)

    if not (self.namn and self.varunr):
        self.state = INVALID
    else:
        self.state = VALID
    return self
def from_html_name(self, webpage):
    """Build the product list from a search-by-name result page.

    The page is split into one table per product type; each table has
    a heading ("typrubrik") and a list of product rows ("prodlista").
    Rows that parse to a valid Product are appended to self.lista.
    Rows that do not are treated as extra containers belonging to the
    most recently added product.

    The state becomes VALID when at least one product was found,
    INVALID otherwise.  Returns self.
    """
    assert self.state == NEW

    type_matcher = MList(
        r'<table width="640" border="0" cellspacing="0" cellpadding="0">',
        MSet([
            ("typrubrik", MSDeH(r'(?s)<font class="rubrik2">(.*?)</font>')),
            ("prodlista", MList(r'<td width="290" align=', M())),
        ]))
    typlista = type_matcher.get(webpage)

    self.lista = []
    for t in typlista:
        grupp = t["typrubrik"]
        for p in t["prodlista"]:
            prod = Product().from_html_name(p, grupp)
            if prod.valid():
                # A real product
                self.lista.append(prod)
            elif self.lista:
                # This should be a dummy product with a container
                # to be added to the last real product
                self.lista[-1].add_containers_from(prod)
            # Otherwise the row precedes any real product, so there is
            # nothing to attach its container to.  Skip it rather than
            # fail with IndexError on the empty list.

    if self.lista:
        self.state = VALID
    else:
        self.state = INVALID
    return self
def from_html(self, webfragment, lan = None):
    """Fill in this store from one row of a store-list page.

    webfragment -- HTML fragment holding a single store row.
    lan         -- value stored as self.lan; None when the caller
                   does not supply one.

    Asserts that a store code, a town and an address were found (the
    phone number is optional), then marks the object VALID.
    Returns self.
    """
    assert self.state == NEW

    store_matcher = MSet([
        ("kod", MS(r'butiknr=([0-9]+)')),
        ("ort", MS(r'>(.*?)</a>')),
        ("adress", MS(r'<td[^>]*>(.*?)</td>')),
        ("telefon", MS(r'<td[^>]*>(.*?)</td>')),
    ])
    fields = store_matcher.get(webfragment)

    self.lan = lan
    self.kod = fields.get("kod")
    self.ort = fields.get("ort")
    self.adress = fields.get("adress")
    self.telefon = fields.get("telefon")

    assert self.kod and self.ort and self.adress
    self.state = VALID
    return self
def from_html_normal(self, webpage):
    """Fill in this product from a full product detail page.

    The matched fields are written straight onto self by
    MSet.get_into_object.  When no name or article number was found
    the state becomes INVALID and the method returns at once.
    Otherwise the name is stripped, the state becomes VALID and every
    container row in the "Info" table is parsed into
    self.forpackningar.  Returns self in both cases.
    """
    assert self.state == NEW

    page_matcher = MSet([
        ("grupp", MSDeH(r'(?s)<td.*?class="text10pxfetvit">(.*?)</td>')),
        ("namn", MSDeH(r'(?s)<span class="rubrikstor">(.*?)\(nr')),
        ("varunr", MS(r'(?s)\(nr.*?([0-9]+)')),
        ("land", MSDeH(r'ursprung=.*?>(.*?)<')),
        ("distrikt", MSDeH(r'(?s)Distrikt</td>(.*?)</td>')),
        ("alkoholhalt", MSDeH(r'(?s)Alkoholhalt</td>(.*?)</td>')),
        ("farg", MSFargDoftSmak("Färg")),
        ("doft", MSFargDoftSmak("Doft")),
        ("smak", MSFargDoftSmak("Smak")),
        ("anvandning", MSFargDoftSmak("Användning")),
        ("sotma", MSC("Sötma", advance = 0)),
        ("fyllighet", MSC("Fyllighet", advance = 0)),
        ("stravhet", MSC("Strävhet", advance = 0)),
        ("fruktsyra", MSC("Fruktsyra", advance = 0)),
        ("beska", MSC("Beska", advance = 0)),
        ("lagring", MSDeH(r'(?s)llbarhet.*?</td>(.*?)</td>')),
        ("druvsorter", MSDeH(r'(?s)Druvsorter.*?</td>(.*?)</td>')),
        ("argang", MSDeH(r'(?s)Provad.*?årgång.*?</td>(.*?)</td>')),
        ("provningsdatum", MSDeH(r'(?s)Provningsdatum.*?</td>(.*?)</td>')),
        ("producent", MSDeH(r'(?s)Producent.*?</td>(.*?)</td>')),
    ])
    page_matcher.get_into_object(webpage, self)

    # Without both a name and an article number this is not a usable
    # product; do not try to parse containers for it.
    if not (self.namn and self.varunr):
        self.state = INVALID
        return self

    self.namn = self.namn.strip()
    self.state = VALID

    container_rows = MLimit(
        r'(?s)Info</td>(.*?)</table>',
        MList(r'<td class="text_tabell" valign="Middle"', M()))
    self.forpackningar = []
    for row in container_rows.get(webpage):
        container = Container().from_html_normal(row)
        self.forpackningar.append(container)
    return self
def from_html(self, webpage):
    """Collect the grape choices offered by the page's "p_druva" select box.

    Every <option> inside the select element contributes a
    (number, name) tuple, appended to the existing self.druvor.
    Options whose name is "-" are skipped.  The state is then set to
    VALID.  Returns self.

    Raises ValueError if an option value cannot be converted by int().
    """
    assert self.state == NEW

    option_matcher = MLimit(
        r'(?s)<select name="p_druva" class="selectDruva">(.*?)</select>',
        MList("<option",
              MSet([
                  ("nr", M('option value="([0-9]+)"')),
                  ("namn", M(">(.*)</option>")),
              ])))

    for d in option_matcher.get(webpage):
        # "!=" replaces the obsolescent "<>" spelling, which is a syntax
        # error on Python 3; the two are equivalent on Python 2.
        if d["namn"] != "-":
            self.druvor.append((int(d["nr"]), d["namn"]))

    self.state = VALID
    return self
def from_html_normal(self, webfragment):
    """Fill in this container from one row of a product detail page.

    Name, size, price and remark are written onto self by
    MSet.get_into_object.  The assortment is not available on this
    page and is recorded as "?".  Asserts that name, size and price
    were found, appends " kr" to the price and marks the object
    VALID.  Returns self.
    """
    assert self.state == NEW

    # A single MSet is used on purpose, rather than one independent
    # matcher per field: the patterns are believed to need applying
    # in sequence, the way MSet does it.
    row_matcher = MSet([
        ("namn", MS(r';">(.*?)</td>')),
        ("storlek", MS(r';">(.*?)</td>')),
        ("pris", MS(r'<td class="text10pxfet".*?>([0-9.]+)')),
        ("anm", MSDeH(r';">(.*?)</td>')),
    ])
    row_matcher.get_into_object(webfragment, self)

    self.sortiment = "?"

    assert self.namn and self.storlek and self.pris
    self.pris += " kr"
    self.state = VALID
    return self
def from_html_stores(self, webpage, lan, ort):
    """Parse a store-list page into self.butiker.

    webpage -- HTML of the store-list page.
    lan     -- county code; the string "99" means the page lists
               several counties, each under its own <H4> heading.
    ort     -- passed to Store.matches_ort to filter the stores.

    For a single-county page the stores are created without a county.
    For a multi-county page each store receives the text of the
    heading it was listed under.  Only stores for which
    matches_ort(ort) is true are kept.
    """
    store_row = r'<tr><td width="200" valign=top>'

    # "!=" replaces the obsolescent "<>" spelling, which is a syntax
    # error on Python 3; the two are equivalent on Python 2.
    if lan != "99":
        # A single county
        self.butiker = []
        for b in MList(store_row, M()).get(webpage):
            s = Store().from_html(b)
            if s.matches_ort(ort):
                self.butiker.append(s)
    else:
        # A list of counties
        lista = MList("<H4>",
                      MSet([
                          ("län", MS(r'<H4>(.*?)</H4>')),
                          ("butikslista", MList(store_row, M())),
                      ])).get(webpage)
        self.butiker = []
        for l in lista:
            # Kept in its own local so the "lan" parameter is not
            # rebound while looping.
            lan_namn = l["län"]
            for b in l["butikslista"]:
                s = Store().from_html(b, lan_namn)
                if s.matches_ort(ort):
                    self.butiker.append(s)
def from_html_productlist(self, webfragment, best = 0):
    """Fill in this container from the tail of a product-list row.

    webfragment -- HTML fragment holding the volume and price cells
                   and, optionally, an availability remark.
    best        -- when true, the container is classed as "best"
                   unless the row says it is stocked in all stores.

    self.sortiment becomes "alla" when the row contains
    "Finns i alla butiker", "best" when best is true or the row
    contains "Beställningsvara", and "" otherwise.  Name and remarks
    are not available here and are set to None.  Asserts that volume
    and price were found, then marks the object VALID.  Returns self.
    """
    assert self.state == NEW

    # Named "fields" so the builtin dict is not shadowed.
    fields = MSet([
        ("volym", MSVolym(r'(?s)<td.*?>(.*?)</td>')),
        ("pris", MS(r'(?s)<td.*?>(.*?)</td>')),
        ("allabutiker", MS(r'(Finns i alla butiker)')),
        ("bestsort", MS(r'(Beställningsvara)')),
    ]).get(webfragment)

    self.namn = None
    self.storlek = fields.get("volym")
    self.pris = fields.get("pris")
    self.anm1 = None
    self.anm2 = None

    # "key in mapping" replaces the deprecated has_key(), which no
    # longer exists on Python 3 dictionaries.
    if "allabutiker" in fields:
        self.sortiment = "alla"
    elif best or "bestsort" in fields:
        self.sortiment = "best"
    else:
        self.sortiment = ""

    assert self.storlek and self.pris
    self.state = VALID
    return self
def split_area_phone(str): m = re.match("^([0-9]+)-([0-9]+)$", str) if m: return m.group(1, 2) else: return None # Search class search_m = MSet([ ("resultat", MList( '<TD align="left" bgcolor="#CCCCCC">', MSet([ ("name_title", MSDeH(r"<(?s).*<b>(.*?)</b>")), ("address", MSDeH(r'(?s)<TD align="left">(.*?)</TD>')), ("phone", MSDeH(r'(?s)<TD align="right" width="100">(.*?)</B>')), ]))), ("maxvisas", MS(r">1-([0-9]+)")), ("totalt", MS(r"(?s).*visas av totalt:.*?([0-9]+)")) ]) class Search: def __init__(self, webpage): (self.dict, pos) = search_m.match(webpage, 0, len(webpage)) def valid(self): return self.dict.has_key("resultat") def to_string(self):