Example #1
    def get_process(process, level):
        # find() returns a cursor, which is truthy even when it matches nothing,
        # so materialize it into a list before testing for "not found".
        if level:
            process_data = list(db.process.find(
                {"process_number": format_proc_number(process), "level": level}
            ))
        else:
            process_data = list(db.process.find(
                {"process_number": format_proc_number(process)}
            ))

        if not process_data:
            print(f"Process number {process} was not found")
        else:
            print(process_data)
Example #2
from typing import Dict


def create_params_2instance(process_number: str) -> Dict[str, str]:
    params = {
        "cbPesquisa": "NUMPROC",
        "tipoNuProcesso": "UNIFICADO",
        "numeroDigitoAnoUnificado": process_number[:15],
        "foroNumeroUnificado": process_number[-4:],
        "dePesquisaNuUnificado": format_proc_number(process_number),
    }
    return params
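The two slices above line up with the punctuated CNJ unified-number format NNNNNNN-DD.AAAA.J.TR.OOOO. A minimal usage sketch follows; the sample number is hypothetical, and the value of dePesquisaNuUnificado depends on the project's format_proc_number helper:

# Hypothetical input in the punctuated CNJ form NNNNNNN-DD.AAAA.J.TR.OOOO.
params = create_params_2instance("0000000-00.2020.8.02.0001")
print(params["numeroDigitoAnoUnificado"])  # "0000000-00.2020" (first 15 characters)
print(params["foroNumeroUnificado"])       # "0001" (last 4 digits, the forum code)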
Example #3
    def crawler(process, start_year, overwrite):
        if process and not overwrite:
            process_data = db.process.find_one(
                {"process_number": format_proc_number(process)}
            )
            if process_data:
                logger.info("process exists")
                return

        if process:
            logger.info(f"run crawler: {process}")
            execute_spider_worker(process)
        elif start_year:
            crawler_many(start_year)
Example #4
def crawler_many(start_year):
    # Yields every year from start_date through end_date, inclusive
    # (equivalent to range(start_date, end_date + 1)).
    def generate(start_date, end_date):
        for count in range(end_date - start_date + 1):
            yield start_date + count

    end_year = datetime.now().year
    start_year = int(start_year)

    for year in generate(start_year, end_year):
        generate_al = create_numbers(year, "12")
        generate_ms = create_numbers(year, "02")

        for number in chain(generate_al, generate_ms):
            logger.info(number)
            if not db.process.find_one({"process_number": format_proc_number(number)}):
                execute_spider_worker.send(number, True)
Example #5
def index():
    content = request.json

    if "process_number" in content and validate(content["process_number"]):
        number = format_proc_number(content["process_number"])
        process_data = list(db.process.find({"process_number": number}))

        if process_data:
            return (
                {
                    "status": "success",
                    "data": json.dumps(process_data, default=str),
                },
                200,
            )
        else:
            execute_spider_worker(number, subprocess=True)
            return {"status": "processing", "data": []}, 200
    return {}, 422
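This view reads Flask's request object and returns (body, status) tuples, so it is meant to run inside a Flask app. A minimal wiring sketch; the route path and HTTP method are assumptions, and the snippet above also needs json and flask.request imported:

from flask import Flask

app = Flask(__name__)
app.add_url_rule("/process", view_func=index, methods=["POST"])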
Example #6
def execute_spider_worker(process_number, subprocess=False):
    url1_instance, url2_instance = get_tj_url(
        format_proc_number(process_number))
    logger.info("execute_spider_worker %s %s" % (url1_instance, url2_instance))

    if subprocess:
        map(
            lambda x: x.join(),
            [
                start_crawler_process(
                    TJ1Crawler, url1_instance, process_number=process_number),
                start_crawler_process(
                    TJ2Crawler, url2_instance, process_number=process_number),
            ],
        )
    else:
        try:
            crawler_process = CrawlerProcess(settings={})
            crawler_process.crawl(
                TJ1Crawler,
                starting_url=url1_instance,
                process_number=process_number,
                params=create_params_1instance(process_number),
            )

            crawler_process.crawl(
                TJ2Crawler,
                starting_url=url2_instance,
                process_number=process_number,
                params=create_params_2instance(process_number),
            )
            crawler_process.start()
        except Exception:
            import traceback

            logger.error(traceback.format_exc())
            return "error", traceback.format_exc()
        else:
            return "ok"