from datetime import date

import pandas as pd

from src.collector import Collector
from src.date import DateRange


def test_collect(start_expr: str, end_expr: str, start_cnt: int, end_cnt: int) -> None:
    collected = Collector.collect(
        user="lntuition",
        date_range=DateRange(
            start=date.fromisoformat(start_expr),
            end=date.fromisoformat(end_expr),
        ),
    )

    # Access the name-mangled private attributes for test purposes only
    assert collected._Extractor__user == "lntuition"
    assert collected._Extractor__df.iloc[0]["date"] == pd.Timestamp(start_expr)
    assert collected._Extractor__df.iloc[-1]["date"] == pd.Timestamp(end_expr)
    assert collected._Extractor__df.iloc[0]["count"] == pd.to_numeric(start_cnt)
    assert collected._Extractor__df.iloc[-1]["count"] == pd.to_numeric(end_cnt)
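# In the project's test suite the arguments above (start_expr, end_expr,
# start_cnt, end_cnt) have to be injected by pytest, most likely through a
# parametrize decorator. A minimal sketch of that pattern, assuming pytest is
# the test runner; the concrete dates and expected counts are illustrative
# placeholders, not values taken from the project:
import pytest


@pytest.mark.parametrize(
    "start_expr, end_expr, start_cnt, end_cnt",
    [
        ("2021-01-01", "2021-01-07", 1, 3),  # placeholder range and counts
    ],
)
def test_collect_example(start_expr: str, end_expr: str, start_cnt: int, end_cnt: int) -> None:
    # Delegate to the test above with the parametrized values
    test_collect(start_expr, end_expr, start_cnt, end_cnt)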
from time import sleep

from src.collector import Collector
from src.config.req import config

if __name__ == '__main__':
    clc = Collector()
    while True:
        clc.collect()
        print('Sleeping...')
        sleep(config.sleep_time)
from src.collector import Collector

if __name__ == "__main__":
    collector = Collector()

    # For convenience the site's own page sorting is used, hence these URLs
    starturl = "http://tenders.polyusgold.com/purchases/?NAME=&CODE=&BU=&PT=&ORDER=DATE_ACTIVE_TO%3AASC"
    archiveurl = "http://tenders.polyusgold.com/archive/?NAME=&BU=&PT=&ORDER=DATE_ACTIVE_TO%3ADESC"

    # Run the active tenders and then the archive in turn, passing the status and the time
    p = collector.collect(starturl, "2", None)
    print("checking archive ...")
    collector.collect(archiveurl, "3", p)
from time import sleep

from settings import sleep_time
from src.collector import Collector

if __name__ == "__main__":
    collector = Collector(quantity=None, publish_date=None, base_url='https://etpgpb.ru')
    while True:
        collector.collect()
        # break
        sleep(sleep_time)
from src.collector import Collector

if __name__ == '__main__':
    Collector().collect()
phpsessid = 't4cjka7f4nedrmmq7dubipji16'

# Path where the diary text is saved
content_file_path = 'data/content.txt'
# Path where the diary timestamps are saved
time_file_path = 'data/time.txt'
# Path where the word-cloud image is saved
content_png_path = 'data/content.png'
# Path where the posting-time histogram is saved
time_png_path = 'data/time.png'

# Scrape the data
# url: a user's Fanfou homepage
# phpsessid: the value from the browser cookies after logging in to Fanfou
# content_file_path: path where the diary text is saved
# time_file_path: path where the diary timestamps are saved
Collector.collect(url, phpsessid, content_file_path, time_file_path)

# Tokenize the text and build the word cloud
# content_file_path: path where the diary text is saved
# content_png_path: path where the word-cloud image is saved
Analyzer.segment_and_visualize(content_file_path, content_png_path)

# Count when the diary entries were posted
# time_file_path: path where the diary timestamps are saved
# time_png_path: path where the time histogram is saved
Analyzer.count_times(time_file_path, time_png_path)
def initCollector(region):
    # interval is a free variable here, expected to be defined at module level
    collector = Collector(region, interval)
    collector.start()
import os
from datetime import date

from src.collector import Collector
from src.date import DateRange
from src.extractor import Extractor
from src.skeleton import SkeletonFactory
from src.writer import Writer

if __name__ == "__main__":
    factory = SkeletonFactory(
        language=os.environ["INPUT_LANGUAGE"],
    )

    Writer(
        extractor=Collector.collect(
            user=os.environ["INPUT_USER"],
            date_range=DateRange(
                start=date.fromisoformat(os.environ["INPUT_START_DATE"]),
                end=date.fromisoformat(os.environ["INPUT_END_DATE"]),
            ),
        ),
        skeleton_string_map=factory.get_string_map(),
        skeleton_list_map=factory.get_list_map(),
    ).write(
        file_name=os.environ["INPUT_FILE_NAME"],
    )
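# The INPUT_* variables read above follow the naming convention GitHub Actions
# uses for action inputs, so this entry point is normally driven by a workflow.
# A minimal sketch for exercising it locally instead; every value below is an
# illustrative placeholder rather than a value from the project:
import os

os.environ.update({
    "INPUT_LANGUAGE": "en",            # placeholder language code
    "INPUT_USER": "lntuition",         # user name, as in the test above
    "INPUT_START_DATE": "2021-01-01",  # must be ISO dates, since the script
    "INPUT_END_DATE": "2021-12-31",    # parses them with date.fromisoformat
    "INPUT_FILE_NAME": "README.md",    # placeholder output file name
})
# Then run the entry point, e.g. `python main.py` (file name assumed here).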