def __init__(self, path, sleep_time=60*60, iter_to_save=10):
    """Store the settings on the instance and create the three loaders.

    :param path: value stored as ``PATH_TO_FILE``
    :param sleep_time: value stored as ``_sleep_time`` (defaults to 3600)
    :param iter_to_save: value stored as ``_iter_to_save`` (defaults to 10)
    """
    # Plain configuration values first.
    self.PATH_TO_FILE = path
    self._sleep_time = sleep_time
    self._iter_to_save = iter_to_save

    # Loader objects are created in the same order as before: TJ, VC, RSS.
    self._tj_loader = TJLoader()
    self._vc_loader = VCLoader()
    self._rss_loader = RSSLoader()
class NewsCollector:
    """Poll the TJ, VC and RSS loaders in an endless loop and dump news to CSV.

    Every ``sleep_time`` seconds all three loaders are queried; after each
    ``iter_to_save`` polls the accumulated rows are filtered by date,
    de-duplicated by ``url`` and written to a timestamped CSV file.
    """

    # Format of the collection-start timestamp that ``news_date`` is compared to.
    _DATE_FORMAT = '%Y-%m-%d %H:%M'

    def __init__(self, path, sleep_time=60 * 60, iter_to_save=10):
        """Create the loaders and remember the polling configuration.

        :param path: prefix that output folder names are appended to
            (plain string concatenation, so it normally ends with a separator)
        :param sleep_time: seconds to sleep before each poll
        :param iter_to_save: number of polls between two CSV dumps
        """
        self._tj_loader = TJLoader()
        self._vc_loader = VCLoader()
        self._rss_loader = RSSLoader()
        self._sleep_time = sleep_time
        self.PATH_TO_FILE = path
        self._iter_to_save = iter_to_save
        # Defined here so ``_filter_date`` is usable before ``load_news`` runs;
        # ``load_news`` resets it when collection actually starts.
        self._start_date = datetime.now().strftime(self._DATE_FORMAT)

    def load_new_news(self):
        """Query every loader once and return all items as a single list."""
        logging.debug("Time is " + datetime.now().strftime(self._DATE_FORMAT))
        # Copy first so the list owned by the loader is never mutated in place.
        pages = list(self._tj_loader.get_tj_news_info())
        pages.extend(self._vc_loader.get_cv_news_info())
        pages.extend(self._rss_loader.get_news_array())
        logging.debug("{} news was loaded".format(len(pages)))
        return pages

    def _drop_duplicates(self, df):
        """Return ``df`` keeping only the first row for every ``url`` value."""
        # ``Series.as_matrix`` no longer exists in current pandas;
        # ``drop_duplicates`` gives the same keep-first result and also works
        # when the index contains repeated labels (as it does after concat).
        return df.drop_duplicates(subset="url", keep="first")

    def _get_filename(self):
        """Build the CSV path for the current time, creating its folder.

        :return: ``<PATH_TO_FILE><Y_m_d_H>/news_<Y_m_d_H_M>.csv``
        """
        # One timestamp for both parts, so folder and file can never disagree
        # when the call happens right at an hour boundary.
        now = datetime.now()
        folder_name = self.PATH_TO_FILE + now.strftime('%Y_%m_%d_%H')
        os.makedirs(folder_name, exist_ok=True)
        filename = now.strftime('%Y_%m_%d_%H_%M') + ".csv"
        return folder_name + "/" + "news_" + filename

    def _filter_date(self, df):
        """
        Remove news that was published before information collection started.

        :param df: frame with a ``news_date`` column
        :return: rows whose ``news_date`` is >= ``self._start_date``
        """
        # NOTE(review): ``_start_date`` is a formatted string, so this is only
        # meaningful if ``news_date`` holds comparable values - confirm
        # against the loaders.
        return df[df["news_date"] >= self._start_date]

    def _prepare_to_save(self, df):
        """Apply the date filter and URL de-duplication before writing."""
        # A frame built from zero items has no columns at all; indexing
        # ``news_date`` on it would raise KeyError.
        if df.empty:
            return df
        df = self._filter_date(df)
        df = self._drop_duplicates(df)
        return df

    def load_news(self):
        """Run the endless collect-and-save loop (never returns)."""
        collected = []
        iteration = 0
        self._start_date = datetime.now().strftime(self._DATE_FORMAT)
        while True:
            logging.debug("Going to sleep for {} seconds".format(self._sleep_time))
            time.sleep(self._sleep_time)

            collected.append(pd.DataFrame(self.load_new_news()))
            iteration += 1
            if iteration % self._iter_to_save != 0:
                continue

            # Concatenate once per save instead of re-copying the accumulated
            # frame on every poll (``DataFrame.append`` is gone in pandas 2).
            frames = [frame for frame in collected if not frame.empty]
            if frames:
                df = self._prepare_to_save(pd.concat(frames))
                logging.debug("Saving... {} news".format(len(df)))
                df.to_csv(self._get_filename(), sep=",", index=False,
                          encoding="utf-8", quoting=csv.QUOTE_NONNUMERIC)
            else:
                logging.debug("No news was loaded, nothing to save")

            # Start a fresh accumulation window.
            collected = []
            self._start_date = datetime.now().strftime(self._DATE_FORMAT)