コード例 #1
0
class CrawlTopMovie:
    """Fetch one page of Douban's Top 250 movie list and store it via MySql.

    The crawler requests ``top250?start=<start_num>&count=<count_num>`` from
    the Douban v2 API, then inserts one row per returned subject into the
    ``movie`` table through the project's ``MySql`` helper.
    """

    def __init__(self, start_num, count_num):
        """Prepare the request parameters and open the database helper.

        Args:
            start_num: Offset of the first list entry to request.
            count_num: Number of entries to request.
        """
        self.movie_url = "http://api.douban.com/v2/movie/top250?"
        self.header = {'User-Agent': 'Mozilla/5.0'}
        # Pre-rendered query-string fragments, joined in getSubjects().
        self.start = "start=" + str(start_num)
        self.count = "count=" + str(count_num)
        self.sql = MySql("Your MySQL setting")
        # The subject currently being imported; every getter below reads
        # from it by string key, so it starts out as an empty mapping.
        self.data = {}

    def getDataFromUrl(self, url):
        """Download ``url`` with the crawler's headers and decode it as JSON.

        Returns:
            The decoded JSON document.
        """
        request = urllib2.Request(url, None, self.header)
        response = urllib2.urlopen(request)
        try:
            payload = response.read()
        finally:
            # Release the socket even when reading fails.
            response.close()
        return json.loads(payload)

    def getSubjects(self):
        """Build the Top 250 request URL, print it, and return the response."""
        url = self.movie_url + self.start + "&" + self.count
        print(url)
        return self.getDataFromUrl(url)

    def crawlData(self):
        """Fetch the configured page and insert every subject into ``movie``.

        Prints ``no subject`` when the response is empty or carries no
        ``subjects`` entry. The database helper is always shut down
        afterwards, including when fetching or inserting raises.
        """
        try:
            subject_data = self.getSubjects()
            subjects = subject_data.get("subjects") if subject_data else None
            if subjects is None:
                print("no subject")
                return
            for subject in subjects:
                self.data = subject
                self.importToMySQL("movie")
        finally:
            self.sql.quit()

    def importToMySQL(self, table):
        """Insert the current subject (``self.data``) as one row of ``table``."""
        # NOTE(review): getMaxID() is used directly as the new row's id;
        # presumably it returns the next free id - confirm against MySql.
        row_id = self.sql.getMaxID(table)
        row = (
            row_id,
            self.getTitle(),
            self.getMovieID(),
            self.getDoubanURL(),
            self.getRating(),
            self.getDirector(),
            self.getGenres(),
            self.getImage(),
            self.getDate(),
        )
        self.sql.insert(table, row)

    def getRating(self):
        """Return the subject's average rating as a float."""
        return float(self.data["rating"]["average"])

    def getDoubanURL(self):
        """Return the subject's ``alt`` URL."""
        return self.data["alt"]

    def getDirector(self):
        """Return the first director's name, or ``""`` when none is listed."""
        directors = self.data["directors"]
        if not directors:
            # Some subjects have no director; do not abort the whole crawl.
            return ""
        return directors[0]["name"]

    def getGenres(self):
        """Return the subject's genres list rendered with ``str()``."""
        return str(self.data["genres"])

    def getMovieID(self):
        """Return the subject id as an int."""
        return int(self.data["id"])

    def getImage(self):
        """Return the URL of the subject's large image."""
        return self.data["images"]["large"]

    def getTitle(self):
        """Return the subject's title."""
        return self.data["title"]

    def getDate(self):
        """Return the subject's ``year`` field as delivered by the API."""
        return self.data["year"]
コード例 #2
0
class BookCrawl:
    """Iterator that crawls Douban book records by numeric id.

    Each ``next()`` call fetches ``https://api.douban.com/v2/book/<id>``,
    stores the record in ``bookinfo`` and its tags in ``booktag`` through
    the project's ``MySql`` helper, then advances to the following id.
    """

    def __init__(self, book_num):
        """Set up the id range and open the database helper.

        Args:
            book_num: Indexable pair ``(first_id, end_id)``; ids are crawled
                while ``first_id < end_id``, so ``end_id`` is exclusive.
        """
        self.douban_url = "https://api.douban.com/v2/book/"
        self.start_num = book_num[0]
        self.end_num = book_num[1]
        self.header = {'User-Agent': 'Mozilla/5.0'}
        self.sql = MySql("Your MySQL setting")
        # The record currently being imported; every getter below reads
        # from it by string key, so it starts out as an empty mapping.
        self.data = {}

    def __iter__(self):
        """Return the crawler itself; it is its own iterator."""
        return self

    def next(self):
        """Crawl the current id and advance.

        Returns:
            The id that will be crawled by the following call (the counter
            is incremented before it is returned).

        Raises:
            StopIteration: Once the id range is exhausted; the database
                helper is shut down first.
        """
        if self.start_num >= self.end_num:
            self.sql.quit()
            raise StopIteration()
        try:
            self.crawlData(self.douban_url + str(self.start_num))
        except Exception:
            # Best effort: report the id that failed and keep going.
            # KeyboardInterrupt/SystemExit are deliberately not swallowed.
            print(self.start_num)
        self.start_num += 1
        return self.start_num

    def test(self, url):
        """Fetch ``url`` and print the parsed price (manual smoke check)."""
        self.data = self.getDataFromUrl(url)
        print(self.getPrice())

    def crawlData(self, url):
        """Fetch the record at ``url`` and insert its info and tag rows."""
        self.data = self.getDataFromUrl(url)
        self.insertBookInfo("bookinfo")
        self.insertBookTag("booktag")

    def getDataFromUrl(self, url):
        """Download ``url`` with the crawler's headers and decode it as JSON.

        Returns:
            The decoded JSON document.
        """
        # SECURITY: this used to eval() the response body, which executes
        # whatever the remote end sends and also breaks on JSON literals
        # such as true/false/null. Parse it as JSON instead.
        import json

        request = urllib2.Request(url, None, self.header)
        response = urllib2.urlopen(request)
        try:
            payload = response.read()
        finally:
            # Release the socket even when reading fails.
            response.close()
        return json.loads(payload)

    def getRating(self):
        """Return the record's average rating as a float."""
        return float(self.data["rating"]["average"])

    def getAuthor(self):
        """Return the first listed author, or ``""`` when there is none."""
        authors = self.data["author"]
        return authors[0] if authors else ""

    def getPublishDate(self):
        """Parse ``pubdate`` (``YYYY``, ``YYYY-MM`` or ``YYYY-MM-DD``).

        Returns:
            A ``datetime.date``; missing month/day default to 1.

        Raises:
            ValueError: If ``pubdate`` does not match the expected layout.
        """
        pubdate = self.data["pubdate"]
        parts = pubdate.split("-")
        # Use the four-digit year directive on the full string: dropping the
        # century and parsing with %y maps years such as 1950 to 2050.
        directives = ["%Y", "%m", "%d"]
        format_type = "-".join(directives[:len(parts)])
        return datetime.datetime.strptime(pubdate, format_type).date()

    def getImage(self):
        """Return the record's image URL."""
        return self.data["image"]

    def getBookID(self):
        """Return the record id as an int."""
        return int(self.data["id"])

    def getPublisher(self):
        """Return the record's publisher."""
        return self.data["publisher"]

    def getTitle(self):
        """Return the record's title."""
        return self.data["title"]

    def getUrl(self):
        """Return the record's ``alt`` URL."""
        return self.data["alt"]

    def getSummary(self):
        """Return the record's summary text."""
        return self.data["summary"]

    def getPrice(self):
        """Return the first number found in the ``price`` string as a float.

        Raises:
            ValueError: If ``price`` contains no digits.
        """
        price = self.data["price"]
        match = re.search(r'\d+\.?\d*', price)
        if match is None:
            raise ValueError("no numeric price in %r" % (price,))
        return float(match.group(0))

    def getBookTag(self):
        """Return the list of tag names (empty when the record has no tags)."""
        tags = self.data["tags"]
        return [item["name"] for item in tags] if tags else []

    def insertBookInfo(self, table):
        """Insert the current record (``self.data``) as one row of ``table``."""
        # NOTE(review): getMaxID() is used directly as the new row's id;
        # presumably it returns the next free id - confirm against MySql.
        row_id = self.sql.getMaxID(table)
        row = (
            row_id,
            self.getBookID(),
            self.getTitle(),
            self.getAuthor(),
            self.getPublisher(),
            self.getPublishDate(),
            self.getPrice(),
            self.getRating(),
            1,
            self.getSummary(),
            self.getUrl(),
            self.getImage(),
        )
        self.sql.insert(table, row)

    def insertBookTag(self, table):
        """Insert one ``(tag_name, book_id)`` row into ``table`` per tag."""
        tags = self.getBookTag()
        if not tags:
            return
        book_id = self.getBookID()
        for tag in tags:
            self.sql.insert(table, (tag, book_id))