Ejemplo n.º 1
0
def resolve_ean(ean):
    """Search the shop for *ean* and scrape the result page into a mapping.

    Args:
        ean: Value substituted for the ``ean`` field of ``SEARCH_URL``.

    Returns:
        ``None`` if the page contains the shop's "no hits" notice. Otherwise
        a mapping with the keys ``type`` ("book", "audiobook" or "movie"),
        ``author``, ``artists``, ``title``, ``imgurl``, ``description``,
        ``duration``, ``studio``, ``genre`` and ``created``. ``studio`` and
        ``genre`` are ``None`` when the attribute table lacks them. The
        values routed through ``defNone`` (defined elsewhere) are presumably
        ``None`` when their source is missing -- confirm against that helper.

    Raises:
        AttributeError: If the category, title, image or attribute-table
            element is not present on the page (``find`` returns ``None``).
    """
    import locale

    page = requests.get(SEARCH_URL.format(ean=ean))

    # Check if something was found
    if "Ihre Suche ergab leider keine Treffer" in page.text:
        return None

    html = lxml.html.document_fromstring(page.text)
    # A defaultdict without a factory behaves like a plain dict; the type is
    # kept so callers receive the same kind of object as before.
    result = defaultdict()

    def resolve_author():
        return defNone(html.find('.//span[@class="oAuthorLinked"]'), lambda x: x.text_content())

    # Check media type
    category = html.find('.//span[@class="noCategory"]').text_content().strip()
    if category.startswith("Buch"):
        result["type"] = "book"
        result["author"] = resolve_author()
        result["artists"] = None
    elif category == "Hörbuch":
        result["type"] = "audiobook"
        result["author"] = resolve_author()
        result["artists"] = None
    else:
        # Everything that is neither a book nor an audiobook is treated as a movie.
        result["type"] = "movie"
        result["artists"] = [elm.text for elm in html.findall('.//span[@class="oAuthorLinked"]/a')]
        result["author"] = None

    # Extract simple attributes from the head of the page
    result["title"] = html.find('.//span[@class="oProductTitle"]').text.strip()
    result["imgurl"] = html.find('.//img[@id="elevateZoom"]').attrib["src"]

    result["description"] = defNone(html.find('.//dd[@class="cTypeBeschreibung"]'), lambda x: x.text_content().strip())

    # Extract attributes of the dd/dt table next to the article picture
    attr_container = html.find('.//dl[@class="dlCols30_70"]')

    attr_list = dict()
    cur_name = None
    for elm in attr_container:
        if elm.tag == "dt":
            cur_name = elm.text.strip()
        elif elm.tag == "dd" and cur_name is not None:
            # A <dd> that appears before any <dt> has no name to file it under.
            attr_list[cur_name] = elm.text_content().strip()

    result["duration"] = defNone(attr_list.get("Spieldauer"), lambda x: int(x.replace("Minuten", "")))

    result["studio"] = attr_list.get("Studio")
    result["genre"] = attr_list.get("Genre")

    # The date is parsed under the German LC_TIME locale (presumably because
    # interpDate relies on locale-dependent names -- confirm against that
    # helper). Restore the previous locale even if parsing raises, since the
    # setting is process-wide.
    old_locale = locale.getlocale(locale.LC_TIME)
    locale.setlocale(locale.LC_TIME, "de_DE.utf8")
    try:
        result["created"] = defNone(attr_list.get("Erscheinungsdatum"), lambda x: interpDate(x))
    finally:
        locale.setlocale(locale.LC_TIME, old_locale)

    return result
Ejemplo n.º 2
0
def resolve_ean(ean):
    """Search rebuy.de for *ean* and scrape the linked product page.

    Args:
        ean: Value substituted positionally into ``SEARCH_URL``.

    Returns:
        ``None`` if the search page has no ``productConversion`` link, i.e.
        there is no product page to follow. Otherwise a dict with the keys
        ``title``, ``type``, ``imgurl`` and ``created``, plus ``author``,
        ``artists``, ``description`` and ``duration``, which this resolver
        always sets to ``None``.

    Raises:
        KeyError: If the category text on the product page is not a key of
            ``TYPE_TRANSLATE``.
    """
    page = requests.get(SEARCH_URL.format(ean))
    html = lxml.html.document_fromstring(page.text)

    # Jump further: the search page only links to the actual product page.
    product_link = html.find('.//a[@class="productConversion"]')
    if product_link is None:
        # No link to follow means no usable hit; report it like the other
        # resolvers do instead of failing on None.
        return None
    further_url = "http://www.rebuy.de/" + product_link.attrib["href"]

    page = requests.get(further_url)
    html = lxml.html.document_fromstring(page.text)
    result = dict()
    result["title"] = html.find('.//h1/span[@class="loud"]').text_content()
    result["type"] = TYPE_TRANSLATE[html.xpath('.//p[contains(@class, "category-icon")]')[0].text_content()]
    result["imgurl"] = html.find(".//img[@id='cover']").attrib["src"]

    # Each fact is rendered as "Name: value"; split on the first colon only.
    attribs = dict()
    for item in html.findall(".//ul[@id='main-info-facts']/li"):
        name, _, val = item.text_content().strip().partition(":")
        attribs[name] = val

    result["created"] = defNone(attribs.get("Erscheinungsdatum"), lambda x: toDBDate(x.strip(), "%d.%m.%Y"))
    result["author"] = None
    result["artists"] = None
    result["description"] = None
    result["duration"] = None

    return result
Ejemplo n.º 3
0
def resolve_ean(ean):
    """POST *ean* to the shop search and scrape the resulting product page.

    Args:
        ean: Value sent as the ``form[q]`` field to ``SEARCH_URL``.

    Returns:
        ``None`` if the page reports that no article was found, or if the
        product has no description element. Otherwise a dict with the keys
        ``type`` ("audiobook", "book" or "movie"), ``author``, ``artists``,
        ``title``, ``description``, ``duration``, ``created``, ``studio``
        and ``imgurl``. ``duration`` is ``None`` when the page has no
        "Gesamtlaufzeit" figure; ``studio`` is ``None`` when the attribute
        list has no "Hersteller" entry.

    Raises:
        AttributeError: If a mandatory element (variant, author for
            books/audiobooks, headline, attribute list or cover image) is
            missing from the page.
    """
    page = requests.post(SEARCH_URL, data={"form[q]": ean})

    # Check if something was found
    if "keine Artikel gefunden" in page.text:
        return None

    html = lxml.html.document_fromstring(page.text)
    result = dict()

    variant = html.find('.//li[@class="variant"]').text_content().strip()
    if variant == "Audio CD":
        result["type"] = "audiobook"
        result["author"] = html.find('.//a[@class="author"]').text_content().strip()
        result["artists"] = None
    elif variant == "Gebundenes Buch":
        result["type"] = "book"
        result["author"] = html.find('.//a[@class="author"]').text_content().strip()
        result["artists"] = None
    else:
        # Any other variant is treated as a movie without author/artists.
        result["type"] = "movie"
        result["artists"] = None
        result["author"] = None

    result["title"] = html.find('.//h1[@class="headline"]').text

    # Attribute list entries are rendered as "Name: value".
    attr_field = html.find('.//ul[@class="plain"]')
    attrs = dict()
    for li in attr_field.findall(".//li"):
        data = li.text_content()
        if data:
            title, _, val = data.partition(":")
            attrs[title] = val.strip()

    # Extract description
    description_element = html.find('.//div[@class="product-description"]/div[2]/div[1]')
    if description_element is None:
        # Ignore this hit if there is no description
        return None

    # Convert <br> elements to newlines so text_content() keeps line breaks.
    for br in description_element.xpath(".//br"):
        br.tail = "\n" + br.tail if br.tail else "\n"
    description = description_element.text_content()

    # Drop everything from "Bonusmaterial" onwards. partition() leaves the
    # text untouched when the marker is absent, whereas slicing with
    # find()'s -1 would silently cut off the last character.
    result["description"] = description.partition("Bonusmaterial")[0]

    # The running time is optional; a page without it yields None.
    duration_match = re.search(r"Gesamtlaufzeit: (\d+) Min.", page.text)
    result["duration"] = int(duration_match.group(1)) if duration_match else None

    result["created"] = defNone(attrs.get("Erscheinungstermin"), lambda x: interpDate(x))
    result["studio"] = attrs.get("Hersteller")

    result["imgurl"] = html.find('.//img[@class="cover"]').attrib["src"]

    return result
Ejemplo n.º 4
0
def resolve_ean(ean):
    """Search for *ean* and scrape title, genre and release info from the page.

    Args:
        ean: Value substituted positionally into ``SEARCH_URL``.

    Returns:
        ``None`` if the page has no ``itemprop='name'`` title element.
        Otherwise a dict with the keys ``title``, ``genre`` and
        ``firstrelease``; ``imdb_link`` is added only when a review link
        whose URL contains "imdb" exists on the page.

    Raises:
        AttributeError: If the genre or product-description element is
            missing from the page.
    """
    page = requests.get(SEARCH_URL.format(ean))
    html = lxml.html.document_fromstring(page.text)

    title_elm = html.find(".//span[@itemprop='name']")

    # When the title is not found on the page, the product seems to be in
    # the unsorted section of geizhals...
    if title_elm is None:
        return None

    result = dict()
    result["title"] = title_elm.text_content()
    result["genre"] = html.find(".//li[@class='ghnavhi']").text_content()
    description = html.find(".//div[@id='gh_proddesc']").text_content()
    result["firstrelease"] = defNone(re.search(r"Ersterscheinung: (\d+)", description), lambda x: x.group(1))

    # Take the first review link that points to IMDb, if there is one.
    for link in html.findall(".//a[@class='revlink']"):
        # Anchors without an href cannot be IMDb links; skip them.
        href = link.attrib.get("href", "")
        if "imdb" in href:
            result["imdb_link"] = href
            break

    return result