コード例 #1
0
ファイル: main.py プロジェクト: adamturn/port-spider
def get_history(start_date=None, end_date=None):
    """Download historical vessel schedules and parse the PDF ones.

    Crawls one schedule per calendar day from ``start_date`` through
    ``end_date`` (both inclusive), sleeping a random 0-4 seconds before each
    crawl, then passes every downloaded ``.pdf`` path to
    ``Wrangler.parse_pdf``.

    Args:
        start_date: First ``datetime.date`` to crawl. Defaults to
            2020-01-01, the value this function previously hard-coded.
        end_date: Last ``datetime.date`` to crawl, inclusive. Defaults to
            2020-01-10, the value this function previously hard-coded.
            An ``end_date`` earlier than ``start_date`` crawls nothing.

    Raises:
        ValueError: If a crawled file path ends in neither ``.xlsx`` nor
            ``.pdf``. This is raised only after all downloads finished.
    """
    if start_date is None:
        start_date = datetime.date(year=2020, month=1, day=1)
    if end_date is None:
        end_date = datetime.date(year=2020, month=1, day=10)

    src_dir = get_src_dir_path(__file__)
    data_dir = src_dir.parent / "data"

    # Iterate by offset instead of mutating the start date, so the caller's
    # arguments stay readable and an inverted range is simply empty.
    day_count = (end_date - start_date).days + 1

    file_paths = []
    jocasta = Spider()
    for offset in range(day_count):
        crawl_date = start_date + datetime.timedelta(days=offset)
        # Random pause between crawls (0, 1, 2, 3 or 4 seconds).
        time.sleep(random.randrange(0, 5, 1))
        file_paths.append(jocasta.crawl(data_dir, date=crawl_date))

    for file_path in file_paths:
        if file_path.endswith('.xlsx'):
            print("unhandled xlsx file!")
        elif file_path.endswith('.pdf'):
            # NOTE(review): the parsed tables are not stored or returned
            # anywhere yet; only parse_pdf's own side effects (if any) remain.
            Wrangler.parse_pdf(file_path)
        else:
            raise ValueError(f"Unexpected file type: {file_path}")
コード例 #2
0
ファイル: main.py プロジェクト: adamturn/port-spider
def main():
    """Crawl one vessel schedule into the data directory and parse it.

    NOTE(review): the crawl date is still pinned to 2020-11-11 for testing
    (see the TODO below), so this does not yet process the latest schedule.

    Raises:
        ValueError: If the crawled file path ends in neither ``.xlsx`` nor
            ``.pdf``.
    """
    src_dir = get_src_dir_path(__file__)
    data_dir = src_dir.parent / "data"

    # TODO: testing on Nov 11, 2020
    pinned_date = datetime.date(year=2020, month=11, day=11)
    file_path = Spider().crawl(data_dir, date=pinned_date)
    # file_path = Spider().crawl(data_dir)

    if file_path.endswith('.xlsx'):
        print("unhandled xlsx file!")
    elif file_path.endswith('.pdf'):
        # A leftover debugging breakpoint() used to follow this call and
        # halted every PDF run in the debugger; it has been removed.
        # NOTE(review): the parsed tables are not stored or returned yet.
        Wrangler.parse_pdf(file_path)
    else:
        raise ValueError(f"Unexpected file type: {file_path}")