コード例 #1
0
ファイル: complex_HTML.py プロジェクト: goodspeedj/csci-e64
                
                type = re.sub('\n|\d+.*|\(.*\)','', g.content.encode('ascii', 'ignore').strip('\r\n'))
                
                
                if ((type != ' \n') and not (re.match('^\s+', type))):
                    genre.append(type)
            
            
            genresStr = ';'.join(genre)
            
            
        #=======================================================================
        # Get the directors
        #=======================================================================
        directors = []
        for movie in movieDom.by_attribute(itemprop="director"):
            
            # Get rid of the html tags
            dir = re.sub('<[a-zA-Z\/][^>]*>','', movie.content.encode('ascii','ignore').lstrip('\r\n'))
            
            # Get rid of new line
            dirs = re.sub('\n', '', dir)
            
            # Directors for other movies have leading spaces - don't add them
            if not re.match('^\s+', dirs):
                directors.append(dirs)
        
        directorsStr = ';'.join(directors)


        #=======================================================================
コード例 #2
0
ファイル: scrape.py プロジェクト: dicai/datavis
def _cook_span_text(pdom, span_id):
    """Return the content of the element with id *span_id*, or "0" if unavailable.

    ``<em>``/``</em>`` tags are replaced by spaces so the leading number can
    be picked out with ``split()[0]`` by the caller.
    """
    try:
        text = pdom.by_attribute(id=span_id)[0].content
        return text.replace("<em>", " ").replace("</em>", " ")
    except Exception:
        # Recipes without this span simply contribute zero to the cook time.
        return "0"


def get_info(baseurl, out_filename, npages=200):
    """Scrape recipe listing pages and write one row per recipe to a file.

    For each listing page ``baseurl + "?Page=<n>"`` with n in
    ``range(1, npages)`` (i.e. pages 1 .. npages - 1), every element of class
    ``rectitlediv`` is followed to its recipe page, and a row is written with:
    title slug, rating, tab-joined nutrient values, total cook time in
    minutes, ';'-joined ingredient names, and ';'-joined "amount| name" pairs.

    Scraping is best-effort: a listing page or a single recipe that cannot be
    downloaded or parsed is skipped. Only ``Exception`` subclasses are
    suppressed, so Ctrl-C (``KeyboardInterrupt``) still stops the run.

    Args:
        baseurl: URL of the recipe listing, without the ``?Page=`` query.
        out_filename: Path of the output file; it is overwritten.
        npages: Exclusive upper bound for the listing page numbers.
    """
    # The context manager closes the file even if the header write fails.
    with open(out_filename, "w") as output:
        w = writer.UnicodeWriter(output)
        # TODO: fix this header
        # NOTE(review): the header names 11 columns but each data row below
        # has 6 cells (all nutrient values share one tab-joined cell). Left
        # unchanged so existing consumers of the file are not broken.
        w.writerow(
            [
                "Title",
                "Rating",
                "Calories (kcal)",
                "Cholesterol (mg)",
                "Fat (g)",
                "Protein (g)",
                "Fiber (g)",
                "Sodium (mg)",
                "Cook Time",
                "Ingredients",
                "Full Ingredients",
            ]
        )

        for page in range(1, npages):
            try:
                url = URL(baseurl + "?Page=%d" % page)
                dom = DOM(url.download(cached=True))
                links = dom.by_class("rectitlediv")
            except Exception:
                # Listing page unavailable or unparsable: move on to the next.
                continue

            # goes through the recipes on a given page
            for link in links:
                try:
                    # get the link name
                    title = link.content.split("/recipe/")[1].split("/detail")[0]
                    # download individual recipe
                    # `base` and `end` are module-level names not visible in
                    # this block -- presumably the recipe URL prefix and
                    # suffix; verify against the rest of the file.
                    rpage = URL(os.path.join(base, title, end))
                    pdom = DOM(rpage.download(cached=True))

                    # average rating value
                    rating = (
                        pdom.by_attribute(itemprop="ratingValue")[0]
                        .source.split('"')[3]
                    )

                    # list of nutrition elements
                    nut_list = pdom.by_class("nutrSumWrap")[0].by_class("nutrSumList")
                    nut_vals = [
                        nut.by_attribute(id="lblNutrientValue")[0].content
                        for nut in nut_list
                    ]
                    nuts = "\t".join(nut_vals)

                    # time needed to cook, in minutes
                    cook_hours = _cook_span_text(pdom, "cookHoursSpan")
                    cook_mins = _cook_span_text(pdom, "cookMinsSpan")
                    mins = str(
                        int(cook_hours.split()[0]) * 60 + int(cook_mins.split()[0])
                    )

                    # ingredients: each block holds both the name and the amount
                    ing_units = []
                    ing_vals = []
                    for ing in pdom.by_attribute(itemprop="ingredients"):
                        tmp_ing = ing.by_id("lblIngName").content
                        if "&nbsp;" in ing.content:
                            continue
                        try:
                            tmp_amount = ing.by_id("lblIngAmount").content
                        except Exception:
                            tmp_amount = ""  # LET THIS BE THE EMPTY CHAR we decide on
                        ing_units.append(tmp_amount)
                        ing_vals.append(tmp_ing)
                    ings = ";".join(ing_vals)

                    # Build "amount| name" strings by stringifying each
                    # (amount, name) tuple and then stripping the tuple/repr
                    # punctuation. Kept as-is so the output text is unchanged.
                    ing_units = [x + "|" for x in ing_units]
                    str_ings = [str(x) for x in zip(ing_units, ing_vals)]
                    str_ings = [x.replace(",", " ") for x in str_ings]
                    full_ings = ";".join(str_ings)
                    full_ings = (
                        full_ings.replace("u'", "")
                        .replace("'", "")
                        .replace(", u", "")
                        .replace("(", "")
                        .replace(")", "")
                        .replace("  ", " ")
                    )

                    w.writerow([title, rating, nuts, mins, ings, full_ings])
                except Exception:
                    # One malformed recipe must not discard the rest of the page.
                    continue
コード例 #3
0
ファイル: complex_HTML.py プロジェクト: aeggermont/cs171
def get_title_attributes(title, titleLink):

    url = URL(titleLink)
    dom = DOM(url.download(cached=True))
    titleObj = Title(title.encode('ascii','replace'))

    print "Movie: ", title

    # Get Directors
    print "-> About to print directors... "

    directors = dom.by_attribute(itemprop="director")[0]
    directorNames =  directors.by_tag("a")


    for director in directorNames:
        print director.content

        dirName  = unicodedata.normalize('NFD', director.content).encode('ascii','replace')
        #str(director.content).encode("utf-8")
        print "Director ===> ", dirName

        titleObj.addDirectors( dirName )

    # Get writers
    print "-> About to print writers... "

    try:
        writers = dom.by_attribute(itemprop="writer")
        for writer in writers:
            # print writer[1][1].content
            titleObj.addWriters( str(writer[1][1].content).encode('ascii', 'replace'))
    except:
        pass



    print "--> About to get actors... "
    try:
        actors = dom.by_attribute(itemprop="actors" )
        for actor in actors:
            # print actor[1][1].content
            titleObj.addActors( str(actor[1][1].content).encode('ascii', 'replace'))
    except:
        pass


    print "--> Aboutb to get rating information... "


    try:
        ratingsInfo = dom.by_class("star-box-giga-star")

        for rating in ratingsInfo:
            # print rating.content
            titleObj.addRating(str(rating.content).encode('ascii', 'replace'))
    except:
        pass


    print "--> About to print other stuff...  "



    for item in dom.by_class("infobar"):

        try:
            objMatch = re.search("(\d+)", item.by_tag("time")[0].content )

            if objMatch:
                # print objMatch.group(1)
                titleObj.addRunTime( str(objMatch.group(1)).encode('ascii', 'replace'))
        except:
            pass



        for genreItem in item.by_tag("a"):

            try:
                objMatch = re.search("genre", genreItem.attributes['href'] )

                if objMatch:
                    titleObj.addGenre(str(genreItem.content).encode('ascii', 'replace'))
                    # print genreItem.attributes['href']
                    # print genreItem.content
            except:
                pass


    return  titleObj