def __init__(self, title, url, date, tag, keyword):
    """Build an article record from its scraped fields.

    The identifier is a hash of the title, the *raw* date string and the
    keyword concatenated together. The date is stored twice: once passed
    through ``self._cleandate`` and once converted by ``date2num``; both
    conversions receive the raw ``date`` argument.
    """
    super(Article, self).__init__()

    # Identity is derived from the unprocessed inputs, before any cleaning.
    identity_source = title + date + keyword
    self._id = gethash(identity_source)

    self.title, self.url = title, url

    # Two representations of the same raw date value.
    cleaned_date = self._cleandate(date)
    self.date = cleaned_date
    numeric_date = date2num(date)
    self.datenum = numeric_date

    self.tag, self.keyword = tag, keyword
def __init__(self, start, end, keyword, total):
    """Describe one search phase for ``keyword`` between ``start`` and ``end``.

    Args:
        start: Phase start date, converted with ``date2num``.
        end: Phase end date, converted with ``date2num``.
        keyword: Search keyword this phase belongs to.
        total: Total number of results; used to derive the page count.

    Attributes set: ``keyword``, ``total``, ``pages``, ``start``, ``end``,
    ``year``, ``month`` and ``_id`` (hash of year + month + keyword).
    """
    super(Phase, self).__init__()
    self.keyword = keyword
    self.total = total

    # Round up so a partially filled last page is still counted.
    # NOTE(review): 15 is presumably the number of results per page - confirm.
    self.pages = int(math.ceil(float(total) / 15))

    self.start = date2num(start)
    self.end = date2num(end)

    # The arithmetic below assumes date2num yields a YYYYMMDD-style integer
    # (TODO confirm against date2num). Floor division is used explicitly:
    # with "/" the result becomes a float under Python 3 true division
    # (e.g. 2014.0315), which would corrupt year, month and the derived _id.
    self.year = self.start // 10000
    self.month = self.start % 10000 // 100

    self._id = gethash(str(self.year) + str(self.month) + str(keyword))
def _tmpfilename(self, url):
    """Return the temp-file path used for ``url``.

    The directory configured in ``self.args['path']`` is created first if
    it does not exist yet. The file name is the hash of ``url`` followed
    by ``.html``, appended directly to the configured path (no separator
    is inserted, so the path is expected to carry its own trailing slash).
    """
    directory = self.args['path']

    already_there = os.path.exists(directory)
    if not already_there:
        os.makedirs(directory)

    basename = str(gethash(url)) + '.html'
    return directory + basename
def test__tmpfilename(self):
    """_tmpfilename returns the hashed path and leaves ./tmp/ on disk."""
    handler_cls = Handler.get('TempHandler')
    handler = handler_cls(SpiderTest('testspider'))

    expected = './tmp/testspider/' + str(gethash('sample')) + '.html'
    actual = handler._tmpfilename('sample')
    self.assertEqual(expected, actual)

    # Calling _tmpfilename is expected to have created the temp directory.
    tmp_root_exists = os.path.exists('./tmp/')
    self.assertTrue(tmp_root_exists)