class Docs():
    """Build one text document per (author, year) pair from paper data in redis.

    A document is the concatenation of the abstracts of the papers an author
    has for a given year; when an abstract is shorter than three characters
    the paper's title is used instead. Documents are written one per line to
    ``PATH_DOC_AUTHOR`` and each line's index is registered together with its
    author and year through ``self.redis.addDocAuthorYear``.
    """

    def __init__(self):
        logging.info('conduct docs firstly')
        self.redis = RedisHelper()

    def _collectAuthorDocs(self, author):
        """Return a list of ``(year, text)`` pairs for ``author``.

        Only papers whose year, converted with ``int()``, is at most
        ``TEST_DATA_YEAR`` contribute. ``year`` is kept exactly as returned
        by ``getPaperYear`` so it can be passed on unchanged.
        """
        partsByYear = {}
        for paper in self.redis.getAuthorPapers(author):
            year = self.redis.getPaperYear(paper)
            if int(year) > TEST_DATA_YEAR:
                continue
            content = self.redis.getPaperAbstract(paper)
            if len(content) < 3:
                # A (near-)empty abstract counts as missing: use the title.
                content = self.redis.getPaperTitle(paper)
            # Collect the pieces and join once instead of repeated `+`.
            partsByYear.setdefault(year, []).append(content)
        # NOTE(review): pieces are joined with no separator, exactly as the
        # original concatenation did; adjacent abstracts may run together.
        # Confirm whether a space is wanted before changing the output.
        return [(year, ''.join(parts)) for year, parts in partsByYear.items()]

    def conductDocs(self):
        """Write every author's per-year documents to ``PATH_DOC_AUTHOR``.

        For each written line, ``addDocAuthorYear(index, author, year)`` is
        called with the zero-based running index of that line. The index is
        printed to stdout every 10000 documents as a progress indicator.

        Any exception raised by the redis helper or by file I/O propagates to
        the caller; the output file is closed in every case.
        """
        index = 0
        # `with` closes the file even when a redis call raises mid-way.
        with open(PATH_DOC_AUTHOR, 'w') as docFile:
            for author in self.redis.getAuthorList():
                for year, doc in self._collectAuthorDocs(author):
                    if index % 10000 == 0:
                        # Parenthesised form prints identically on Python 2
                        # and is also valid on Python 3.
                        print(index)
                    # NOTE(review): a newline inside an abstract/title would
                    # break the one-document-per-line mapping to `index`;
                    # confirm the stored text is single-line.
                    docFile.write(doc + '\n')
                    self.redis.addDocAuthorYear(index, author, year)
                    index += 1