def __init__(self):
    """Load previously stored lines, reopen the output file, and reset catalogs.

    Every line currently in ``FILE_NAME`` is read into ``self.news`` before
    the same file is reopened for writing, which truncates it. An empty
    ``newsList`` is then passed to ``newdb.update`` for each catalog listed
    under ``NEWSCATALOG["data"][1]["list"]``.

    Raises:
        IOError/OSError: if ``FILE_NAME`` cannot be opened for reading or
            writing.
    """
    self.news = set()
    self.updatedNews = set()
    self.newsList = []

    # Read the old contents first: the "wb" open below truncates the file.
    # The context manager guarantees the read handle is closed even if
    # iteration fails part-way through.
    with open(FILE_NAME) as existing:
        for line in existing:
            # Strip only a real trailing newline so that a final line
            # without one does not lose its last character.
            self.news.add(line.rstrip("\n"))

    # Kept open for the lifetime of the pipeline; process_item writes to it.
    # NOTE(review): presumably closed when the spider finishes -- confirm.
    self.file = open(FILE_NAME, "wb")

    for s in NEWSCATALOG["data"][1]["list"]:
        newdb.update({'newCatalog': s, 'newsList': []})
def process_item(self, item, spider):
    """Write the item as a JSON line and group it under its catalog.

    The item is serialised once (UTF-8, sorted keys); the encoded form is
    added to ``self.updatedNews`` and written to ``self.file`` followed by a
    newline. Items are accumulated in ``self.currentCatalogList``; when an
    item arrives whose ``"newCatalog"`` differs from the current non-empty
    catalog, the accumulated list is handed to ``newdb.update`` and a new
    list is started with this item.

    Args:
        item: mapping-like scraped item; must contain the key
            ``"newCatalog"``.
        spider: the spider that produced the item (unused here).

    Returns:
        The same ``item``, unchanged.

    Raises:
        KeyError: if ``item`` has no ``"newCatalog"`` entry.
    """
    # NOTE(review): self.currentCatalog / self.currentCatalogList are read
    # here but not assigned in the visible __init__ -- confirm they are
    # initialised elsewhere (expected initial values: "" and []).
    record = dict(item)
    encoded = json.dumps(record, ensure_ascii=False, sort_keys=True).encode('utf-8')
    self.updatedNews.add(encoded)

    catalog = item["newCatalog"]
    if self.currentCatalog != catalog and self.currentCatalog != "":
        # The catalog changed: hand over the finished group and start afresh.
        newdb.update({'newCatalog': self.currentCatalog, 'newsList': self.currentCatalogList})
        self.currentCatalogList = []

    # Always keep the current item; previously the item that triggered a
    # catalog switch was dropped from the new catalog's list.
    self.currentCatalogList.append(record)
    self.currentCatalog = catalog

    # The file is opened in binary mode, so the newline must be bytes too.
    self.file.write(encoded + b"\n")
    return item