Example #1
# Imports assumed from the project layout shown in Example #6.
import pandas as pd
from datetime import date

from src.collector import Collector
from src.date import DateRange


def test_collect(start_expr: str, end_expr: str, start_cnt: int,
                 end_cnt: int) -> None:
    collected = Collector.collect(
        user="******",
        date_range=DateRange(
            start=date.fromisoformat(start_expr),
            end=date.fromisoformat(end_expr),
        ),
    )

    # Reach into the name-mangled Extractor attributes; test-only access.
    assert collected._Extractor__user == "lntuition"
    assert collected._Extractor__df.iloc[0]["date"] == pd.Timestamp(start_expr)
    assert collected._Extractor__df.iloc[-1]["date"] == pd.Timestamp(end_expr)
    assert collected._Extractor__df.iloc[0]["count"] == pd.to_numeric(start_cnt)
    assert collected._Extractor__df.iloc[-1]["count"] == pd.to_numeric(end_cnt)
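
The expected boundary values arrive as test parameters, so the original suite presumably drives this function with pytest.mark.parametrize. A minimal sketch of that decorator, with hypothetical dates and counts:

import pytest

@pytest.mark.parametrize(
    "start_expr, end_expr, start_cnt, end_cnt",
    [
        # Hypothetical row: ISO dates bounding the range and the expected
        # contribution counts on the first and last day.
        ("2021-01-01", "2021-12-31", 1, 4),
    ],
)
def test_collect(start_expr, end_expr, start_cnt, end_cnt):
    ...  # body as in Example #1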
Example #2
from src.collector import Collector

if __name__ == "__main__":
    collector = Collector()
    # For convenience I rely on the site's page sorting, hence this particular link
    starturl = "http://tenders.polyusgold.com/purchases/?NAME=&CODE=&BU=&PT=&ORDER=DATE_ACTIVE_TO%3AASC"
    archiveurl = "http://tenders.polyusgold.com/archive/?NAME=&BU=&PT=&ORDER=DATE_ACTIVE_TO%3ADESC"
    # Run the active tenders and then the archive in turn, passing the status and the time along
    p = collector.collect(starturl, "2", None)
    print("checking archive ...")
    collector.collect(archiveurl, "3", p)
Example #3
from src.collector import Collector
from src.config.req import config
from time import sleep

if __name__ == '__main__':
    clc = Collector()

    while True:
        clc.collect()
        print('Sleeping...')
        sleep(config.sleep_time)
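
The polling interval comes from config.sleep_time in src.config.req. A minimal sketch of what that config module might look like; the dataclass shape and the 600-second default are assumptions, not part of the original project:

# src/config/req.py (assumed shape)
from dataclasses import dataclass

@dataclass(frozen=True)
class Config:
    sleep_time: int = 600  # seconds between collection runs (assumed value)

config = Config()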
Example #4
from time import sleep
from src.collector import Collector
from settings import sleep_time

if __name__ == "__main__":
    collector = Collector(quantity=None,
                          publish_date=None,
                          base_url='https://etpgpb.ru')
    while True:
        collector.collect()
        #break
        sleep(sleep_time)
Example #5
    phpsessid = 't4cjka7f4nedrmmq7dubipji16'

    # Path where the diary text is saved
    content_file_path = 'data/content.txt'

    # Path where the diary timestamps are saved
    time_file_path = 'data/time.txt'

    # Path where the word-cloud image is saved
    content_png_path = 'data/content.png'

    # Path where the posting-time bar chart is saved
    time_png_path = 'data/time.png'

    # Scrape the data
    # url: the Fanfou homepage of the person to collect
    # phpsessid: the cookie value from the browser after logging in to Fanfou
    # content_file_path: path where the diary text is saved
    # time_file_path: path where the diary timestamps are saved
    Collector.collect(url, phpsessid, content_file_path, time_file_path)

    # Segment the text and build the word cloud
    # content_file_path: path where the diary text is saved
    # content_png_path: path where the word-cloud image is saved
    Analyzer.segment_and_visualize(content_file_path, content_png_path)

    # Count when the diary entries were posted
    # time_file_path: path where the diary timestamps are saved
    # time_png_path: path where the posting-time bar chart is saved
    Analyzer.count_times(time_file_path, time_png_path)
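
This snippet is a fragment from inside the script's main block; the url variable and the Collector and Analyzer imports sit above the excerpt. A minimal sketch of the assumed preamble, where the module paths and the placeholder homepage are assumptions and not part of the original:

from src.collector import Collector   # assumed module path
from src.analyzer import Analyzer     # assumed module path

if __name__ == '__main__':
    # Placeholder: the Fanfou homepage of the person whose diary is scraped
    url = 'https://fanfou.com/<user-id>'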
Example #6
import os
from datetime import date

from src.collector import Collector
from src.date import DateRange
from src.extractor import Extractor
from src.skeleton import SkeletonFactory
from src.writer import Writer

if __name__ == "__main__":
    factory = SkeletonFactory(
        language=os.environ["INPUT_LANGUAGE"],
    )

    Writer(
        extractor=Collector.collect(
            user=os.environ["INPUT_USER"],
            date_range=DateRange(
                start=date.fromisoformat(os.environ["INPUT_START_DATE"]),
                end=date.fromisoformat(os.environ["INPUT_END_DATE"]),
            ),
        ),
        skeleton_string_map=factory.get_string_map(),
        skeleton_list_map=factory.get_list_map(),
    ).write(
        file_name=os.environ["INPUT_FILE_NAME"],
    )