Example #1
def get_data():
    executor = ThreadPoolExecutor(max_workers=WORKER)

    # get the res dir ready
    mkdir_res()

    # get url
    # as_completed expects a single iterable of futures (its second argument is a timeout)
    list(
        as_completed([
            executor.submit(get_chengjiao_house_url),
            executor.submit(get_ershoufang_house_url),
        ]))

    # get ershoufang info
    list(
        as_completed(
            executor.submit(get_ershoufang_house_info, hs)
            for hs, name in HOUSE_DISTRICT_DICT.items()
            if not (DATA_DIR / "house_info" / "ershoufang" /
                    f"{hs}.json").is_file()))

    # get chengjiao info
    list(
        as_completed(
            executor.submit(get_chengjiao_house_info, hs)
            for hs, name in HOUSE_DISTRICT_DICT.items()
            if not (DATA_DIR / "house_info" / "chengjiao" /
                    f"{hs}.json").is_file()))

    # save to csv
    to_csv()
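
The original version of this example passed two futures to as_completed as separate positional arguments; as_completed takes a single iterable of futures (its second parameter is a timeout), which is why the call above wraps them in a list. A minimal, self-contained sketch of that contract, with placeholder task functions standing in for the crawlers:

from concurrent.futures import ThreadPoolExecutor, as_completed

def fetch_a():
    return "a"

def fetch_b():
    return "b"

with ThreadPoolExecutor(max_workers=2) as executor:
    # submit() returns Future objects; as_completed() expects an iterable of them
    futures = [executor.submit(fetch_a), executor.submit(fetch_b)]
    for future in as_completed(futures):
        print(future.result())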
Example #2
def run(start_page, end_page):
    with ThreadPoolExecutor(max_workers=8) as t:
        begin = time.time()
        obj_lst = []
        for i in range(start_page, end_page + 1):
            if i == 1:
                page_url = 'http://pic.netbian.com/index.html'
            else:
                page_url = f'http://pic.netbian.com/index_{i}.html'

            obj = t.submit(craw_detail, page_url)
            obj_lst.append(obj)
        for future in as_completed(obj_lst):
            data = future.result()
            # print(data)
            imgurl_lst.extend(data)
        print(imgurl_lst)
        print(len(imgurl_lst))
        print(f'Elapsed: {time.time() - begin}')

        begin = time.time()
        # use a fresh list so we do not re-wait on the already finished crawl futures
        download_lst = []
        for imgurl in imgurl_lst:
            obj = t.submit(download, imgurl)
            download_lst.append(obj)

        for future in as_completed(download_lst):
            future.result()
        print(f'Elapsed: {time.time() - begin}')
Example #3
def RunCOESim(generation, organisms):
    global currentGen
    currentGen = generation

    resultPath = os.path.join(
        baseResultPath,
        f"{scenario}-{generation}" if not scenario == "" else str(generation))

    # Get only the distinct organisms to simulate in each generation; otherwise we will run into problems eventually.
    # Annoyingly, this has to be done sequentially.
    orgsToSimulate = []
    for organism in organisms:
        if organism not in orgsToSimulate:
            orgsToSimulate.append(organism)

    # Do a dynamic-programming-style lookup of previous generations' results so we can potentially cut down on the amount of actual simulation
    if generation != 0:
        with open(os.path.join(resultPath, "../GAResults.json"), "r") as f:
            resJson = json.load(f)

        with ThreadPoolExecutor(max_workers=threads) as executor:
            sims = {
                executor.submit(FindPreviousOrganisms, organism, generation,
                                resJson, resultPath, scenario, baseResultPath):
                organism
                for organism in orgsToSimulate
            }

            for result in as_completed(sims):
                if debugOutput:
                    print(
                        f"Found match in previous generation: {result.result()}"
                    )

    outputQ = []
    with ThreadPoolExecutor(max_workers=threads) as executor:
        sims = {
            executor.submit(defineRunAndEvaluateSimulation, parsedMMJson,
                            scenario, organism, dseConfig, resultPath,
                            basePath, threads > 1, coeConfig, debugOutput):
            organism
            for organism in orgsToSimulate
        }

        for result in as_completed(sims):
            outputQ.append(result.result())

    addSimulationDirToRankingFileThreaded(outputQ, resultPath)
Example #4
    def launch(self):
        self._tonglian_init()
        self._yuqing_init()

        max_id, min_id = self.select_max_title_id()
        print("news_id 的范围是: ", min_id, max_id)
        for i in range(min_id // self.batch_num, max_id // self.batch_num + 1):
            news_id_start = self.batch_num * i
            news_id_end = self.batch_num * (i + 1)
            print("当前范围是: ", news_id_start, news_id_end)
            sql = '''select T.NEWS_ID, T.NEWS_PUBLISH_TIME, T.NEWS_TITLE, T.NEWS_PUBLISH_SITE, B.NEWS_BODY \
from vnews_content_v1 T, vnews_body_v1 B \
where T.NEWS_ID >= {} and T.NEWS_ID <= {} \
and B.NEWS_ID >= {} and B.NEWS_ID <= {}  \
and T.NEWS_ID = B.NEWS_ID \
and T.NEWS_PUBLISH_TIME between '{}' and '{}'; '''.format(
                news_id_start, news_id_end, news_id_start, news_id_end,
                self.start_time, self.end_time)
            print("sql: ", sql)
            datas = self.tonglian_client.select_all(sql)
            print("当前数据量是: ", len(datas))

            items = []
            with ThreadPoolExecutor(max_workers=10) as t:
                res = [t.submit(self.post_api, data) for data in datas]
            for future in as_completed(res):
                item = future.result()
                if item:
                    # print(">>> ", item)
                    items.append(item)

            print(len(items))
            self._batch_save(self.yuqing_client, items, self.target_table_name,
                             self.target_fields)
            self.yuqing_client.end()
Example #5
    def launch2(self):
        self._tonglian_init()
        self._yuqing_init()

        dt = self.start_time
        while dt <= self.end_time:
            end_dt = dt + datetime.timedelta(days=1)
            sql = '''select T.NEWS_ID, T.NEWS_URL, T.NEWS_ORIGIN_SOURCE, T.NEWS_PUBLISH_TIME, T.NEWS_TITLE, T.NEWS_PUBLISH_SITE, B.NEWS_BODY \
from vnews_content_v1 T, vnews_body_v1 B \
where T.NEWS_PUBLISH_TIME between '{}' and '{}' \
and T.NEWS_ID = B.NEWS_ID; '''.format(dt, end_dt)
            print(sql)
            datas = self.tonglian_client.select_all(sql)
            print("当前数据量是: ", len(datas))
            items = []
            with ThreadPoolExecutor(max_workers=10) as t:
                res = [t.submit(self.post_api, data) for data in datas]
            for future in as_completed(res):
                item = future.result()
                if item:
                    items.append(item)
            print(len(items))
            self._batch_save(self.yuqing_client, items, self.target_table_name,
                             self.target_fields)
            self.yuqing_client.end()
            dt = end_dt
Example #6
def extract_all_fuz(cleaned_output: Path, work_dir: Path):
    """
    Extracts FUZ files into XWM files.
    """
    fuz_dir = work_dir / "extracted"
    xwa_dir = work_dir / "wav"

    # This is entirely I/O bound, so it's a good candidate to run in parallel.
    candidates = []

    with cleaned_output.open(mode="r", newline="") as f:
        reader = csv.DictReader(f)
        for line in reader:
            infile = fuz_dir / line["original_path"].lower()
            infile = infile.with_suffix(".fuz")
            outfile = xwa_dir / line["original_path"].lower()
            outfile = outfile.with_suffix(".xwm")
            outfile.parent.mkdir(parents=True, exist_ok=True)
            candidates.append((infile, outfile))

    count = 0

    with ThreadPoolExecutor() as exe:
        futures = [exe.submit(lambda tp: extract_fuz(*tp), i) for i in candidates]
        for i in as_completed(futures):
            count += 1

            if count % 100 == 0:
                print(f"Extracted {count} files...")
Example #7
def main():
    Q = QiniuProvider()
    countries = parseTable(
        Link(f'{domain}/wiki/List_of_IOC_country_codes').getText())
    beginTime = datetime.now()
    with ThreadPoolExecutor(max_workers=10) as pool:
        allTasks = []

        sqlFile = open('countryList.sql', 'w', encoding="utf-8")
        sqlWriter = SQLExporter(sqlFile, 'nationality',
                                ['name', 'code', 'flag'])

        for country in countries:
            print(f'grabbing {country.name}...')
            allTasks.append(
                pool.submit(
                    executor,
                    ExecutorParams(q=Q, country=country, writer=sqlWriter)))

        for task in as_completed(allTasks):
            print(f'{task.result()} downloaded.')

        sqlFile.close()

        endTime = datetime.now()

        print(f'run time: {endTime - beginTime}')
Example #8
    def generate_files_multithreaded(self, query, query_right=None):
        query_to_run_left = query
        query_to_run_right = query
        if query_right:  # if only one query is supplied, run the same query on both connections
            query_to_run_right = query_right  # if a second query is supplied for the right side, set it here.

        left_stx = SqlToXl(self.left_connection_string)
        right_stx = SqlToXl(self.right_connection_string)

        futures = []
        with ThreadPoolExecutor(max_workers=2) as executor:
            futures.append(
                executor.submit(
                    left_stx.save_sql,
                    *[query_to_run_left, self.left_file_path,
                      self.left_sheet]))
            futures.append(
                executor.submit(
                    right_stx.save_sql, *[
                        query_to_run_right, self.right_file_path,
                        self.right_sheet
                    ]))

        for f in as_completed(futures):
            if f.exception():
                logging.error("recived Exception from thread {}".format(
                    f.exception()))
                raise f.exception()
            else:
                logging.info("recived result from thread {}".format(
                    f.result()))

        return self.left_file_path, self.right_file_path
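
This example checks Future.exception() explicitly before reading the result. An equivalent and common alternative is to call result() inside try/except and let it re-raise whatever the worker raised; a minimal, self-contained sketch of that variant (task() and the values here are placeholders, not part of the original class):

import logging
from concurrent.futures import ThreadPoolExecutor, as_completed

logging.basicConfig(level=logging.INFO)

def task(n):
    if n == 2:
        raise ValueError("boom")
    return n * n

with ThreadPoolExecutor(max_workers=2) as executor:
    futures = [executor.submit(task, n) for n in range(4)]
    for f in as_completed(futures):
        try:
            # result() re-raises any exception stored on the future
            logging.info("received result from thread %s", f.result())
        except Exception:
            logging.exception("received exception from thread")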
Example #9
    def run_desired_simulataneous_get_object_calls(
        self,
        s3_olap_arn,
        file_name,
        expected_error,
        connection_counts=DEFAULT_CONNECTIONS,
        period_in_minutes=DEFAULT_PERIOD,
    ):
        logging.info(
            f"Running Load Test for file : {file_name}  for period {period_in_minutes} with connection counts: {connection_counts}"
        )
        s = ThreadPoolExecutor(max_workers=connection_counts)
        futures = [
            s.submit(self.fail_safe_fetch_s3_object,
                     s3_olap_arn,
                     file_name,
                     period=period_in_minutes * 60,
                     expected_error=expected_error)
            for i in range(0, connection_counts)
        ]

        total_counts = 0
        successful_counts = 0
        average_latency = 0
        for f in as_completed(futures):
            successful_counts += f.result()[1]
            total_counts += f.result()[0] + f.result()[1]
            average_latency = f.result()[2] / total_counts

        logging.info(
            f" Total calls made: {total_counts}, out of which {successful_counts} calls were successful."
            f" ({successful_counts / total_counts * 100}%) ,Average Latency {average_latency}"
        )
        return total_counts, total_counts - successful_counts, average_latency
Example #10
    def contains_pii_entities(
            self,
            documents: List[Document],
            language=DEFAULT_LANGUAGE_CODE) -> List[Document]:
        """Call comprehend to get pii classification of given documents."""
        documents_copy = deepcopy(documents)
        result = []
        with self.classification_executor_service:
            futures = []
            for doc in documents_copy:
                futures.append(
                    self.classification_executor_service.submit(
                        self._update_doc_with_pii_classification, doc,
                        language))

            for future_result in as_completed(futures):
                try:
                    result.append(future_result.result())
                except Exception as error:
                    LOG.error(
                        "Error occurred while calling comprehend for classifying text as pii",
                        exc_info=True)
                    self.classify_metrics.add_fault_count()
                    raise error
        return result
Example #11
def reap(
        path,
        known_bad_packages=(),
        number_to_reap=1000,
):
    if not os.path.exists(path):
        os.makedirs(path)
    sorted_files = list(diff(path))
    print(f"TOTAL OUTSTANDING ARTIFACTS: {len(sorted_files)}")
    sorted_files = sorted_files[:number_to_reap]

    with executor(max_workers=5, kind="dask") as pool:
        futures = {
            pool.submit(
                fetch_and_run,
                path,
                package,
                dst,
                src_url,
                # progress.update
            ): (package, dst, src_url)
            for package, dst, src_url in sorted_files
            if (src_url not in known_bad_packages)
        }
        for f in tqdm(as_completed(futures), total=len(sorted_files)):
            try:
                f.result()
            except ReapFailure as e:
                print(f"FAILURE {e.args}")
            except Exception:
                pass
Example #12
def report_conda_forge_names_from_import_map(total_imports,
                                             builtin_modules=None,
                                             ignore=None):
    if ignore is None:
        ignore = []
    if builtin_modules is None:
        builtin_modules = _builtin_modules
    report_keys = [
        'required', 'questionable', 'builtin', 'questionable no match',
        'required no match'
    ]
    report = {k: set() for k in report_keys}
    import_to_pkg = {k: {} for k in report_keys}
    import_to_artifact = {k: {} for k in report_keys}
    futures = {}

    with ThreadPoolExecutor() as pool:
        for name, md in total_imports.items():
            if all([
                    any(
                        fnmatch(filename, ignore_element)
                        for ignore_element in ignore) for filename, _ in md
            ]):
                continue
            elif recursively_search_for_name(name, builtin_modules):
                report['builtin'].add(name)
                continue
            future = pool.submit(extract_pkg_from_import, name)
            futures[future] = md
    for future in as_completed(futures):
        md = futures[future]
        most_likely_pkg, _import_to_artifact, _import_to_pkg = future.result()

        for (filename, lineno), import_metadata in md.items():
            # Make certain to throw out ignored imports here as well, since an import can happen
            # multiple times under different circumstances; e.g. import matplotlib may be required
            # by a test file but questionable for a regular file
            if any(
                    fnmatch(filename, ignore_element)
                    for ignore_element in ignore):
                continue
            if any(
                    import_metadata.get(v, False)
                    for v in SKETCHY_TYPES_TABLE.values()):
                # if we couldn't find any artifacts to represent this then it doesn't exist in our maps
                if not _import_to_artifact:
                    report_key = 'questionable no match'
                else:
                    report_key = 'questionable'
            else:
                # if we couldn't find any artifacts to represent this then it doesn't exist in our maps
                if not _import_to_artifact:
                    report_key = 'required no match'
                else:
                    report_key = 'required'

            report[report_key].add(most_likely_pkg)
            import_to_pkg[report_key].update(_import_to_pkg)
            import_to_artifact[report_key].update(_import_to_artifact)
    return report, import_to_artifact, import_to_pkg
Example #13
    def test_emergency_shutdown(self, mock_scan_commands):
        # Given a lot of servers to scan
        total_server_scans_count = 100
        server_scans = [
            ServerScanRequest(
                server_info=ServerConnectivityInfoFactory.create(),
                scan_commands={ScanCommandForTests.MOCK_COMMAND_1, ScanCommandForTests.MOCK_COMMAND_2},
            )
            for _ in range(total_server_scans_count)
        ]

        # And the scans get queued
        scanner = Scanner()
        for scan in server_scans:
            scanner.queue_scan(scan)

        # When trying to quickly shutdown the scanner, it succeeds
        scanner.emergency_shutdown()

        # And all the queued jobs were done or cancelled
        all_queued_futures = []
        for server_scan in scanner._queued_server_scans:
            all_queued_futures.extend(server_scan.all_queued_scan_jobs)
        for completed_future in as_completed(all_queued_futures):
            assert completed_future.done()
Example #14
    def post_api(self, datas):
        params = []
        for data in datas:
            title = data.get("Title2")
            if not title:
                title = data.get('SecuAbbr') + data.get("Title1")
            req_data = {
                'texttype': 'ann',
                'title': title,
                'content': title,
                'prolist': ['event_ann'],
            }
            params.append((req_data, data, title))
        items = []

        with ThreadPoolExecutor(max_workers=10) as t:
            res = [t.submit(self.post_task, *param) for param in params]
        for future in as_completed(res):
            item = future.result()
            if item:
                items.append(item)

        # for param in params:
        #     try:
        #         item = self.post_task(*param)
        #     except:
        #         item = None
        #     if item:
        #         items.append(item)

        return items
Example #15
    def fetchDataAsList(self):

        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = []
            for source in self.subSources:
                futures.append(executor.submit(source.fetchDataAsList))

            for future in as_completed(futures):
                print('completed load of source data in list')
Example #16
    def have_valid_relative_data(self, data, relative="child_relation"):
        with ProcessPoolExecutor() as executor:
            res = []
            for index, relation_meta in enumerate(self.metadata_c.metadata["sequential_info"][relative]):
                res.append(
                    executor.submit(self._search_file, relation_meta["path_of_child_table"], index, data, relative))

            for f in as_completed(res):
                if not f.result():
                    return False

        return True
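
Unlike the thread-based examples, this one uses ProcessPoolExecutor, so the submitted callable and its arguments must be picklable, and on platforms that spawn new interpreters the submission should sit under an if __name__ == "__main__" guard. A minimal standalone sketch of the same submit/as_completed shape with processes (check() is a placeholder for a validation such as _search_file):

from concurrent.futures import ProcessPoolExecutor, as_completed

def check(index):
    # placeholder validation; must be a module-level function so it can be pickled
    return index % 2 == 0

if __name__ == "__main__":
    with ProcessPoolExecutor() as executor:
        futures = [executor.submit(check, i) for i in range(8)]
        # every future must report True for the data to be considered valid
        all_valid = all(f.result() for f in as_completed(futures))
        print(all_valid)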
Example #17
    def scan_ports(self,
                   ports: typing.Iterable[int]) -> typing.List[ScanResult]:
        futures = []
        with Halo(text="***** Port scanning in progress...", color="blue"):
            with self.executor as executor:
                for port in ports:
                    futures.append(executor.submit(self.scan, port))

            return [
                x for x in [f.result() for f in as_completed(futures)]
                if x.is_open
            ]
Example #18
def fix_metadata(date, workers):
    uri = f"s3://dea-public-data/baseline/s2b_ard_granule/{date}/**/*.yaml"

    fetcher = S3Fetcher(aws_unsigned=True)
    s3_obj_stream = s3_find_glob(uri, skip_check=True, s3=fetcher)
    s3_url_stream = (o.url for o in s3_obj_stream)
    data_stream = list(fetcher(s3_url_stream))
    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = [executor.submit(process_dataset, s3_obj) for s3_obj in data_stream]
        
        for future in as_completed(futures):
            if future.exception() is not None:
                raise future.exception()
Example #19
def find(host_list: list):
    records = []

    with ThreadPoolExecutor(max_workers=cpu_count() + 1) as pool:
        task_list = []

        for host in host_list:
            obj = pool.submit(parse_url, host.strip().replace("\n", ""))
            task_list.append(obj)

        for task in as_completed(task_list):
            result = task.result()
            print(result)
            records.append(result)

    # return the collected records (the list is otherwise unused)
    return records
Example #20
def t1():
    def job(a):
        time.sleep(random.random())
        return a * 2

    with ThreadPoolExecutor(max_workers=50) as executor:
        future_to_param = {}
        for i in range(30):
            future_to_param[executor.submit(job, i)] = i

        for future in as_completed(future_to_param):
            result = future.result()
            print(f"{future_to_param[future]} -> {result}")
Example #21
def main(granule_ids, sns_topic_arn, workers):
    """
    Script to sync Sentinel-2 data from NCI to an AWS S3 bucket.

    Pass in a file containing the granule ids that need to be uploaded.
    """

    setup_logging()

    granule_ids = [granule_id.strip() for granule_id in granule_ids.readlines()]

    _LOG.info(f"{len(granule_ids)} granules to upload.")
    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = [executor.submit(upload_granule, granule_id, sns_topic_arn) for granule_id in granule_ids]

        for future in tqdm(as_completed(futures), total=len(granule_ids), unit='granules', disable=None):
            _LOG.info(f"Completed uploaded: {future.result()}")
Example #22
def main(args, device, num_available_devices):
    model_path = Path(args.model)
    root_dir = Path(args.root_dir)

    image_paths = [
        file_name for file_name in root_dir.glob('**/*') if is_image(file_name)
    ]
    analyzed_images = []

    ctx = multiprocessing.get_context('forkserver')
    executor = ProcessPoolExecutor(max_workers=num_available_devices,
                                   mp_context=ctx,
                                   initializer=init_process,
                                   initargs=(model_path, not args.no_split,
                                             device))

    try:
        with executor:
            # Map each future back to its image path so failures report the right file
            current_jobs = {}
            for image_path in image_paths:
                submitted_job = executor.submit(
                    consumer, image_path,
                    str(image_path.relative_to(root_dir)))
                current_jobs[submitted_job] = image_path

            for job in tqdm(as_completed(current_jobs),
                            total=len(current_jobs)):
                try:
                    result = job.result()
                    analyzed_images.append(result)
                except Exception as e:
                    print(f"Could not process {current_jobs[job]}, reason: {e}")
                    traceback.print_exc(file=sys.stdout)
    except KeyboardInterrupt:
        pass

    with (root_dir / 'handwriting_analysis.json').open('w') as f:
        json.dump(analyzed_images, f, indent='\t')

    num_has_handwriting = len(
        [im for im in analyzed_images if im['has_handwriting']])
    print(
        f"Handwriting to no handwriting ratio: {num_has_handwriting / len(analyzed_images)}"
    )
Example #23
def main(s3_urls, workers):
    """
    Script to sync Sentinel-2 data from NCI to AWS S3 bucket

    Pass in a file containing destination S3 urls that need to be uploaded.

    """
    setup_logging()

    global S3
    S3 = s3_client()
    urls_to_upload = [url.strip() for url in s3_urls.readlines()]

    _LOG.info(f"{len(urls_to_upload)} datasets to upload.")
    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = [executor.submit(upload_dataset, s3_url) for s3_url in urls_to_upload]

        for future in tqdm(as_completed(futures), total=len(urls_to_upload), unit='datasets', disable=None):
            _LOG.info(f"Completed uploaded: {future.result()}")
Example #24
def compare_grayskull_audits(gx):
    grayskull_files = os.listdir("audits/grayskull")
    bad_inspections = {}

    if "_net_audit.json" in grayskull_files:
        grayskull_files.pop(grayskull_files.index("_net_audit.json"))
        with open("audits/grayskull/_net_audit.json") as f:
            bad_inspections = load(f)

    futures = {}
    with executor("dask", max_workers=20) as pool:

        for node, attrs in gx.nodes("payload"):
            if not attrs.get("version"):
                continue
            node_version = f"{node}_{attrs['version']}"
            if node_version in bad_inspections:
                continue
            # construct the expected filename
            expected_filename = f"{node_version}.yml"
            if expected_filename in grayskull_files:
                with open(
                    os.path.join("audits/grayskull", expected_filename),
                ) as f:
                    meta_yaml = f.read()
                futures[
                    pool.submit(
                        inner_grayskull_comparison,
                        meta_yaml=meta_yaml,
                        attrs=attrs,
                        node=node,
                    )
                ] = node_version
        for future in as_completed(futures):
            try:
                bad_inspections[futures[future]] = future.result()
            except Exception as e:
                bad_inspections[futures[future]] = str(e)

    with open("audits/grayskull/_net_audit.json", "w") as f:
        dump(bad_inspections, f)
    return bad_inspections
Example #25
    def launch(self):
        self._yuqing_init()  # 2020-09-01 - 2020-10-01
        end_time = datetime.datetime(2020, 11, 6)
        start_time = datetime.datetime(2020, 10, 20)

        dt = start_time
        while dt <= end_time:
            dt_next = dt + datetime.timedelta(days=1)

            limit_start = 0
            while True:  # TODO: 1002 = Eastmoney, 1007 = Tonghuashun
                sql = '''select * from {} where OrgTableCode = '1002' and PubDatetime >= '{}' and PubDatetime <= '{}' order by id limit {}, {};'''.format(
                    self.source_table,
                    dt,
                    dt_next,
                    limit_start * self.batch_num,
                    self.batch_num,
                )
                print(sql)
                datas = self.yuqing_client.select_all(sql)
                print("select datas: ", len(datas))

                if len(datas) == 0:
                    break

                items = []
                with ThreadPoolExecutor(max_workers=10) as t:
                    res = [t.submit(self.post_api, data) for data in datas]
                for future in as_completed(res):
                    item = future.result()
                    if item:
                        items.append(item)

                print(limit_start, len(items))
                if items:
                    self._batch_save(self.yuqing_client, items,
                                     self.target_table, self.target_fields)
                    self.yuqing_client.end()

                limit_start += 1
            dt = dt_next
Example #26
    def get_links(self):
        if not self.urls:
            print("done, image links parsed: %d" % len(self.images))
            print(*self.images, sep='\n')
            return self.images

        with ThreadPoolExecutor(cpu_count()) as executor:
            future_to_page = {
                executor.submit(Scraper.one_page_crawl, page_url): page_url
                for page_url in self.urls
            }
            for future in as_completed(future_to_page):
                url_done = future_to_page[future]
                links, images = future.result()
                self.images |= images
                self.done.add(url_done)
                self.urls.remove(url_done)
                self.urls |= links - self.done

        print("\nurls to crawl: %s\nurls done: %s\n" % (self.urls, self.done))
        return self.get_links()
Example #27
def parse_parallel(parser: AppStoreParser, max_workers=20) -> List[Review]:
    """
    Parse app reviews in parallel

    :AppStoreParser parser: parser object
    :int max_workers: the maximum number of threads that can be used to parse reviews
    """

    rating_count = parser.get_app_rating_count()
    LOGGER.info(f'App "{parser.app_name}" has {rating_count} reviews')
    if rating_count > MAX_REVIEWS:
        rating_count = MAX_REVIEWS
        LOGGER.warning(f'App "{parser.app_name}" has more than {MAX_REVIEWS} reviews')

    last_page = rating_count // REVIEWS_PER_PAGE
    LOGGER.info(f'Reviews to scan: {rating_count}')

    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        pages_range = range(1, last_page + 1)
        future_to_page = {
            executor.submit(parser.get_reviews_page, page): page for page in pages_range
        }
        for future in as_completed(future_to_page):
            page = future_to_page[future]
            try:
                reviews = future.result()
            except Exception as exc:
                LOGGER.error(f'Exception on page #{page:03d}: {exc!r}')
            else:
                LOGGER.info(f'Page #{page:03d} successfully scanned')
                results.append((page, reviews))

    results.sort(key=itemgetter(0))
    reviews = [r for reviews_from_page in results for r in reviews_from_page[1]]
    LOGGER.info(f'Scanned reviews: {len(reviews)}')
    return reviews
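
Because as_completed yields futures in completion order rather than submission order, the example records each page number and sorts the collected results afterwards. When results are only needed in input order, executor.map is a leaner alternative; a minimal sketch with a placeholder fetch_page function (not part of the original parser):

from concurrent.futures import ThreadPoolExecutor

def fetch_page(page):
    return [f"review-{page}-{i}" for i in range(3)]

with ThreadPoolExecutor(max_workers=4) as executor:
    # map() yields results in the order of the inputs, not in completion order
    pages = range(1, 6)
    reviews = [review
               for page_reviews in executor.map(fetch_page, pages)
               for review in page_reviews]
    print(len(reviews))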
Example #28
def find_supplying_version_set(
        volume, get_symbol_table_func=web_interface.get_symbol_table):
    supplying_versions = {}

    effective_volume = sorted(volume - builtin_symbols)
    symbol_by_top_level = groupby(effective_volume,
                                  key=lambda x: x.partition(".")[0])
    bad_symbols = set()

    with ThreadPoolExecutor() as pool:
        futures = {
            pool.submit(get_supply, top_level_import, list(v_symbols),
                        get_symbol_table_func): top_level_import
            for top_level_import, v_symbols in symbol_by_top_level
        }
    for future in as_completed(futures):
        top_level_import = futures[future]
        supplies, bad = future.result()
        supplying_versions[top_level_import] = supplies
        bad_symbols.update(bad)
    # TODO: handle the case where multiple pkgs export the same symbols?
    #  In that case we may want to merge those together somehow
    # TODO: handle case where no pkg supports the symbol?
    return supplying_versions, bad_symbols
Example #29
def main(n_to_pull=1000):
    path = "audit"

    if os.path.exists(os.path.join(path, "_inspection_version.txt")):
        with open(os.path.join(path, "_inspection_version.txt")) as f:
            db_version = f.read()
    else:
        db_version = ""
    if db_version != complete_version and os.path.exists(path):
        shutil.rmtree(path)

    if not os.path.exists(path):
        os.makedirs(path)
    with open(os.path.join(path, "_inspection_version.txt"), "w") as f:
        f.write(complete_version)

    all_extracted_artifacts = web_interface.get_current_extracted_pkgs()
    existing_artifacts = glob.glob(f"{path}/**/*.json", recursive=True)
    existing_artifact_names = {k.partition("/")[2] for k in existing_artifacts}

    artifacts = sorted(
        list(set(all_extracted_artifacts) - set(existing_artifact_names)))

    # Shuffle so the artifacts are not processed in alphabetical order
    shuffle(artifacts)

    with ThreadPoolExecutor() as pool:
        futures = [
            pool.submit(inner_loop_and_write, artifact)
            for artifact in artifacts[:n_to_pull]
        ]
        for future in tqdm(as_completed(futures), total=n_to_pull):
            try:
                future.result()
            except requests.exceptions.ConnectionError:
                pass
Example #30
    def handle(self, *args, **options):
        url = options['url']
        tmp_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'tmp')
        start = time.time()

        # Open a streaming connection to download the large file
        with requests.get(url, stream=True) as r:
            r.raise_for_status()

            # Counter i to cap how much is downloaded
            i = 0
            for chunk in r.iter_lines(chunk_size=100000, decode_unicode=True):
                if i >= MAX_LINES_COUNT:  # roughly 100 MB
                    break

                if not chunk:
                    continue

                # Randomly distribute lines across files based on the number of system cores (line order does not matter)
                with open(f'{tmp_path}/access_log{random.randint(0, min(32, (os.cpu_count() or 1) + 4) - 1)}', 'at') as f:
                    f.write(f'{chunk}\n')
                i += 1

        download_time = time.time()
        print(f'Log download took: {download_time - start}')

        # Create a thread pool, submit a file-reading task for each file, and wait for each one to finish
        with ThreadPoolExecutor() as executor:
            futures = [executor.submit(self.bulk_create_logs, f'{tmp_path}/{filename}') for filename in os.listdir(tmp_path)]
            for idx, future in enumerate(as_completed(futures)):
                print(f'File #{idx} processed')

        print(f'Import finished. Processing time: {time.time() - download_time}')
        if DELETE_AFTER_IMPORT:
            for filename in os.listdir(tmp_path):
                os.unlink(f'{tmp_path}/{filename}')
Example #31
    return all_symbol_tables


if __name__ == "__main__":
    web_interface = WebDB()
    extracted_artifacts = web_interface.get_current_symbol_table_artifacts()
    all_artifacts = web_interface.get_current_extracted_pkgs().values()

    artifacts_to_index = list(set(all_artifacts) - set(extracted_artifacts))
    print(f"Number of artifacts to index: {len(artifacts_to_index)}")

    # The shuffle here tries to avoid, where possible, two threads running on the same symbol table json at once
    shuffle(artifacts_to_index)
    pool = ThreadPoolExecutor()
    # Note that there is a race condition here: two threads could try to write to the same symbol table.
    # However, one of them will win, so next round one will have been added safely, and this continues
    # until none are left to be added.
    print("issuing futures")
    futures = {
        pool.submit(inner_loop, artifact_name): artifact_name
        for artifact_name in tqdm(artifacts_to_index[:10000])
    }
    print("awaiting futures")
    for future in tqdm(as_completed(futures), total=len(futures)):
        print(futures[future])
        try:
            future.result()
        except Exception as e:
            print(e)
    pool.shutdown()