Example #1
    def update_realtime_storage(self):
        """
        Scrape realtime data and store it locally
        """
        args = []
        for i in range(0, 5):
            args.append((i, self.reg, self.reg_sym))
        not_create = True
        while not_create:
            try:
                p = ThreadPool()
                not_create = False
            except RuntimeError as e:
                time.sleep(3)
                not_create = True

        dict_list = p.starmap(process_plaintext, args)

        args_remain = []
        for i in range(5, tm.CODE_SEGMENT_NUM):
            args_remain.append((i, self.reg, self.reg_sym))
        dict_list_remain = p.starmap(process_plaintext, args_remain)
        dict_curr = {k: v for dic in dict_list for k, v in dic.items()}
        dict_curr_remain = {
            k: v
            for dic in dict_list_remain for k, v in dic.items()
        }
        self.realtime_quotes = {**dict_curr, **dict_curr_remain}
Example #2
def crawl_listed_companies(options, workers_num=10):
    companies = []

    pool = Pool(processes=workers_num)

    try:
        func_params = []
        companies_initials = options.get("crawling_initials", COMPANIES_LISTING_SEARCHER_LETTERS)

        if not companies_initials:
            companies_initials = COMPANIES_LISTING_SEARCHER_LETTERS

        for letter in companies_initials:
            func_params.append([letter, options])

        pool.starmap(update_listed_companies, func_params)

        # Merge all the responses into one only list
        # companies += list(
        #    itertools.chain.from_iterable(call_results))

        return companies
    except TimeoutError:
        _logger.exception("Timeout error")
        raise
    finally:
        pool.close()
        pool.join()
        pool.terminate()
Example #3
def parse_url(url, postid=0, commentid=0):
    """ Gets image hash(es) from URL, populates database """

    if is_direct_link(url):
        parse_image(url, postid, commentid)
        return True

    if not should_parse_link(url):
        return

    image_urls = get_image_urls(url)
    url = clean_url(url)

    # We assume that any url that yields more than 1 image is an album
    albumid = 0
    if len(image_urls) > 1:
        albumid = get_or_create_album(url)

    if len(image_urls) > 10:
        logger.debug("Using multithreading to download large album")
        pool = ThreadPool(processes=10)
        pool.starmap(func=parse_image,
                     iterable=zip(image_urls, repeat(postid),
                                  repeat(commentid), repeat(albumid)))
        pool.close()
    else:
        for image_url in image_urls:
            parse_image(image_url, postid, commentid, albumid)
    return True
Example #4
    def push(self, x: Union[str, List[str]], quiet: bool = False):
        """Push a tag or a repository to a registry

        Alias: `docker.push(...)`

        # Arguments
            x: Tag(s) or repo(s) to push. Can be a string or a list of strings.
                If it's a list of strings, python-on-whales will push all the images with
                multiple threads. The progress bars might look strange as multiple
                processes are drawing on the terminal at the same time.
            quiet: If you don't want to see the progress bars.

        # Raises
            `python_on_whales.exceptions.NoSuchImage` if one of the images does not exist.
        """
        x = to_list(x)

        # this is just to raise a correct exception if the images don't exist
        self.inspect(x)

        if len(x) == 0:
            return
        elif len(x) == 1:
            self._push_single_tag(x[0], quiet=quiet)
        elif len(x) >= 2:
            pool = ThreadPool(4)
            generator = self._generate_args_push_pull(x, quiet)
            pool.starmap(self._push_single_tag, generator)
            pool.close()
            pool.join()
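A minimal usage sketch for the push wrapper above (the registry and tag names are placeholders; `docker` is the top-level python-on-whales client named by the docstring's alias):

from python_on_whales import docker

docker.push("registry.example.com/app:latest")                    # single tag: pushed directly
docker.push(["app:1.0", "app:1.1", "app:latest"], quiet=True)     # list: pushed concurrently via ThreadPool(4)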
Example #5
    def main(self):
        if self.check_if_crawled:
            file_path = get_content_file_path(date=self.date,
                                              from_s3=self.from_s3)
            empty = check_if_file_is_empty(file_path, from_s3=self.from_s3)
            if not empty:
                print("Already Parsed", file_path)
                return

        link_data_list = self.iom.read_links_from_file(self.date)
        self.total_links_count = len(link_data_list)
        print("START Content Crawling", self.date, self.total_links_count)

        if self.thread_for_each_request:
            i = 0
            print("start thread", self.threads_count)
            pool = ThreadPool(self.threads_count)
            pool.starmap(self.crawl_single_article,
                         zip(link_data_list, repeat(self.log_progress)))
            # close the pool and wait for the work to finish
            pool.close()
            pool.join()
        else:
            i = 0
            for link_data in link_data_list:
                i += 1
                self.crawl_single_article(link_data)
                self.log_progress()

        self.callback(self.date, self.content_data_list)
        return
Example #6
def stop_services_and_wait_for_tasks_to_stop(cluster, services, timeout,
                                             region):
    ecs_client = _get_ecs_client(region)

    THREAD_MAX = 32
    number_of_services = len(services)

    all_running_service_tasks = []

    for service in services:
        running_service_tasks = get_tasks_for_service(cluster=cluster,
                                                      service=service,
                                                      region=region)

        if running_service_tasks:
            all_running_service_tasks += running_service_tasks
            stop_service(cluster=cluster, service=service, region=region)
        else:
            LOGGER.info(
                "No active tasks found in service '{}'".format(service))

    if all_running_service_tasks:
        pool = ThreadPool(number_of_services
                          if number_of_services < THREAD_MAX else THREAD_MAX)
        pool.starmap(wait_for_task_to_stop,
                     ((cluster, task, timeout, region)
                      for task in all_running_service_tasks))
        pool.close()
        pool.join()
Example #7
    def post(self):
        uploaded_files = request.files.getlist("file")
        private = "private" == request.form.get("access_type")
        presigned_posts = []

        for file in uploaded_files:
            if (file.filename == ""):
                flash("You must select at least one image!", 'danger')
                return render_template("upload.html"), 400

            identifier = str(uuid.uuid4())
            image = ImageModel(session['username'], identifier,
                               datetime.datetime.now(), private)
            try:
                image.save_to_database()
            except Exception:
                return {
                    'message':
                    'An error occurred saving the image to the database.'
                }, 500

            new_post = create_presigned_post(S3_BUCKET_NAME, identifier)
            presigned_posts.append(new_post)

        pool = ThreadPool(processes=20)
        start = time.time()
        pool.starmap(upload, zip(presigned_posts, uploaded_files))
        end = time.time()
        print(end - start)
        return redirect("/personal/1"), 301
Example #8
 def get_all_product_from_tiki(self, save_db=False):
     proxies = self.get_proxies()
     self.proxy_pool = cycle(proxies)
     leaf_categories = category.Category.query.filter_by(
         is_leaf_cat=False, is_scraped=False).all()
     pool = ThreadPool(10)
     pool.starmap(self.get_all_products_from_category,
                  zip(leaf_categories, [save_db] * len(leaf_categories)))
Example #9
def crawl_image(url):
    pool = ThreadPool(multiprocessing.cpu_count())
    song_title, img_urls = get_img_url(url)
    os.mkdir('{}/{}'.format(img_folder, song_title))
    pool.starmap(download_img, enumerate(zip(repeat(song_title), img_urls)))
    pool.close()
    pool.join()
    return song_title
Example #10
def crawl_all_content(THREAD_COUNT, start_date, end_date,
                      threads_count_per_date):
    dates = rrule(DAILY, dtstart=start_date, until=end_date)
    pool = ThreadPool(THREAD_COUNT)
    pool.starmap(start_crawl, zip(dates, repeat(threads_count_per_date)))
    # close the pool and wait for the work to finish
    pool.close()
    pool.join()
Example #11
def run_crawl(max_page=MAX_PAGE, thread_max=5):
    with open(RESULT_SAVE_PATH, 'wt', newline='') as fp:
        writer = csv.writer(fp)
        # CSV header (Chinese): title, cover image URL, author, price, review count, positive-review rate, store name
        writer.writerow(('标题', '封面图片地址', '作者', '价格', '评论数', '好评率', '商店名'))
        pool = ThreadPool(thread_max)
        pool.starmap(crawl,
                     [(writer, page) for page in range(1, max_page + 1)])
    pool.close()
    pool.join()
Example #12
def download_multi(urls_filepaths, overwrite=False):
    global TOTAL
    TOTAL = 0

    pool = ThreadPool(settings.THREAD_LIMIT)
    if overwrite:
        pool.starmap(_download, urls_filepaths)
        return len(urls_filepaths)
    else:
        pool.starmap(_download_if_not_exists, urls_filepaths)
        return TOTAL
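The worker functions `_download` and `_download_if_not_exists` are not shown above; a minimal sketch of the counting variant, assuming `_download(url, filepath)` fetches a URL to disk (names and behaviour are assumptions, not from the source):

import os

def _download_if_not_exists(url, filepath):
    """Hypothetical helper: download only when the target file is missing."""
    global TOTAL
    if os.path.exists(filepath):
        return
    _download(url, filepath)   # assumed to write the file to disk
    TOTAL += 1                 # note: += on a module global is not atomic across threads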
Example #13
 def init(self, export='csv', path='out'):
     # fetch the data
     # verify data integrity
     # record data that errored out / or
     # ip = socket.gethostbyname(self.SINA_API_HOSTNAME)
     stock_codes = self.get_all_stock_codes()
     exists_codes = [code[:-4] for code in os.listdir(os.path.join(path, 'raw_data')) if code.endswith('.csv')]
     stock_codes = set(stock_codes).difference(exists_codes)
     pool = ThreadPool(500)
     params = [(code, export, path) for code in stock_codes]
     pool.starmap(self.out_stock_history, params)
Example #14
 def init(self, export='csv', path='out'):
     # fetch the data
     # verify data integrity
     # record data that errored out / or
     # ip = socket.gethostbyname(self.SINA_API_HOSTNAME)
     # self.SINA_API = self.SINA_API % ip
     print(self.SINA_API)
     stock_codes = self.get_all_stock_codes()
     pool = ThreadPool(1)
     params = [(code, export, path) for code in stock_codes]
     pool.starmap(self.out_stock_history, params)
Example #15
def batch_process(ip_address: str, batch_threads: int, port_from: int,
                  port_to: int, timeout: float, open_ports: list):
    pool = ThreadPool(processes=batch_threads)

    items = []
    for port in range(port_from, port_to):
        items.append((ip_address, port, timeout, open_ports))

    pool.starmap(scan_port, items)
    pool.close()
    pool.join()
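The `scan_port` worker is not included above; a minimal sketch under the assumption that it does a plain TCP connect and appends open ports to the shared list:

import socket

def scan_port(ip_address, port, timeout, open_ports):
    """Hypothetical worker: attempt a TCP connect and record the port on success."""
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.settimeout(timeout)
    try:
        if sock.connect_ex((ip_address, port)) == 0:
            open_ports.append(port)   # list.append is safe to call from worker threads
    finally:
        sock.close()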
Example #16
def phylogenetic_informativeness(padded_primer_product_path, nex_path,
                                 tapir_out_path, pi_score_path, ref_tree_fn,
                                 tapir_opts_string):
    cpu_count = mp.cpu_count()

    # remove and recreate nex and tapir output directories
    shutil.rmtree(nex_path, ignore_errors=True)
    shutil.rmtree(tapir_out_path, ignore_errors=True)
    shutil.rmtree(pi_score_path, ignore_errors=True)
    os.makedirs(pi_score_path, exist_ok=True)

    # create tapir input and output directories for each core
    for i in range(cpu_count):
        os.makedirs(nex_path + '{num:02d}'.format(num=i), exist_ok=True)
        os.makedirs(tapir_out_path + '{num:02d}'.format(num=i), exist_ok=True)

    # for each fasta file, make a nex (split into cpu_count subdirectories)
    i = 0
    convertfasta2nex_input = []
    for file in os.listdir(padded_primer_product_path):
        group = '{num:02d}'.format(num=i % cpu_count)
        padded_primer_product_fn = padded_primer_product_path + file
        nex_fn = nex_path + group + "/" + file + ".nex"
        convertfasta2nex_input.append((padded_primer_product_fn, nex_fn))
        i += 1

    pool = ThreadPool(cpu_count)
    pool.starmap(convertfasta2nex, convertfasta2nex_input)
    pool.close()
    pool.join()

    # for each core, process a subdirectory
    tapir_driver_input = []
    for i in range(cpu_count):
        nex_sub_path = nex_path + '{num:02d}/'.format(num=i)
        tapir_out_sub_path = tapir_out_path + '{num:02d}/'.format(num=i)
        tapir_driver_input.append(
            (nex_sub_path, tapir_out_sub_path, ref_tree_fn, tapir_opts_string))

    pool = ThreadPool(cpu_count)
    pool.starmap(tapir_driver, tapir_driver_input)
    pool.close()
    pool.join()

    # collect sql to a directory
    sql_list = []
    for i in range(cpu_count):
        tapir_out_sub_path = tapir_out_path + '{num:02d}/'.format(num=i)
        sql_list.extend([(tapir_out_sub_path + fn,
                          pi_score_path + '{num:02d}'.format(num=i) + '_' + fn)
                         for fn in os.listdir(tapir_out_sub_path)
                         if ".sqlite" in fn])
    for old_fn, new_fn in sql_list:
        shutil.move(old_fn, new_fn)
Example #17
 def init(self, export='csv', path='out'):
     # fetch the data
     # verify data integrity
     # record data that errored out / or
     # ip = socket.gethostbyname(self.SINA_API_HOSTNAME)
     # self.SINA_API = self.SINA_API % ip
     print(self.SINA_API)
     stock_codes = self.get_all_stock_codes()
     pool = ThreadPool(1)
     params = [(code, export, path) for code in stock_codes]
     pool.starmap(self.out_stock_history, params)
Example #18
def write_multi(data_filepaths, overwrite=False):
    global TOTAL
    TOTAL = 0

    pool = ThreadPool(settings.THREAD_LIMIT)
    if overwrite:
        pool.starmap(_write, data_filepaths)
        return len(data_filepaths)
    else:
        pool.starmap(_write_if_not_exists, data_filepaths)
        return TOTAL
Example #19
def suppression_couleur(frame, width, height, hsv_indesirable, aire=5):

    pool = ThreadPool(4)

    pool.starmap(
        thread_suppr_coul,
        zip(itertools.repeat(frame), itertools.repeat(width),
            itertools.repeat(height), itertools.repeat(hsv_indesirable),
            itertools.repeat(aire), [(0, 0), (0, 1), (1, 0), (1, 1)]))
    pool.close()
    pool.join()
    return frame
Example #20
def _process_queued_parse_task(event, context):
    bullhorn = Bullhorn.retrieve()
    topic_arn = bullhorn.find_task_arn('aio_leech')
    batch = []
    for entry in event['Records']:
        entry_body = rapidjson.loads(entry['body'])
        original_payload = rapidjson.loads(entry_body['Message'])
        original_payload['topic_arn'] = topic_arn
        batch.append((original_payload, context))
    pool = ThreadPool(len(batch))
    pool.starmap(parse_batch_encounters, batch)
    pool.close()
    pool.join()
Example #21
    def complete_samples(self, low, hi):
        pool = ThreadPool(multiprocessing.cpu_count())
        on = pool.starmap(self.create_sample,
                          [(i, True)
                           for i in range(low, hi + 1) if i not in self.on])
        off = pool.starmap(self.create_sample,
                           [(i, False)
                            for i in range(low, hi + 1) if i not in self.off])

        pool.close()
        pool.join()

        self.on.update({i: j for i, j in on})
        self.off.update({i: j for i, j in off})
Example #22
    def __traverse_multiprocessors(self,
                                   category_members,
                                   curr_level=1,
                                   max_level=1,
                                   max_sub_categories=4,
                                   max_articles_per_category=None):
        category_members_titles = [
            member.title() for member in category_members
        ]
        top_level_categories_list = []
        for category_title in self.level_1_categories:
            if category_title in category_members_titles:
                top_level_categories_list.append(
                    category_members[category_title])
            else:
                category = wikipediaapi.Wikipedia('en').page(category_title)
                category.ns = wikipediaapi.Namespace.CATEGORY
                top_level_categories_list.append(category)
        top_level_categories_list = top_level_categories_list[:min(
            len(top_level_categories_list), max_sub_categories)]
        top_level_articles = [
            member for member in category_members.values()
            if member.ns == wikipediaapi.Namespace.MAIN
        ]
        top_level_articles = top_level_articles[:min(
            len(top_level_articles), max_articles_per_category)]
        self.add_top_level_categories_and_articles(top_level_categories_list +
                                                   top_level_articles)

        categories_names = [
            member.title.lower() for member in top_level_categories_list
        ]
        num_categories = len(categories_names)
        categories_members_list = [
            member.categorymembers for member in top_level_categories_list
        ]
        parameters = list(
            zip(categories_names, categories_members_list,
                it.repeat(curr_level + 1, num_categories),
                it.repeat(max_level, num_categories),
                it.repeat(max_sub_categories, num_categories),
                it.repeat(max_articles_per_category, num_categories)))
        global lock
        lock = Lock()
        pool = ThreadPool(processes=10)
        pool.starmap(self.traverse_categories_tree_synchronized, parameters)
        pool.close()
        pool.join()
Example #23
def upload_files_with_thread_pool(upload_function,
                                  bucket_name,
                                  threads_no=8,
                                  meta=None):
    pool = ThreadPool(processes=threads_no)
    arguments = []
    for filename, file_size in FILES.items():
        arguments.append((filename, bucket_name, filename, file_size))
    global_start_time = time.perf_counter()
    pool.starmap(upload_function, arguments)
    global_end_time = time.perf_counter()

    print(
        f"[Thread Pool Upload] Total elapsed time: {global_end_time - global_start_time}"
    )
Example #24
def main():
    parser = argparse.ArgumentParser("Grabs Gutenberg books by ID from a file")
    parser.add_argument("--idfile", type=str, required=True)
    parser.add_argument("--outdir", type=str, required=True)

    args = parser.parse_args()

    if not os.path.exists(args.idfile):
        raise RuntimeError("idfile not found")

    with open(args.idfile, "r") as infile:
        ids = [(int(line.strip()), args.outdir) for line in infile]

    pool = ThreadPool(80)
    pool.starmap(get_one_book, ids)
Example #25
def multi_threading(pool_fn,
                    pool_args,
                    disable_multiprocessing=False,
                    dataframe_mode=False):
    """
    Wrap multi threading for external c++ calls.

    Args:
        pool_fn: any function that takes a single argument. For multi-argument functions, reduce it
                 with functools.partial to a single argument. The first argument needs to be the list
                 over which the pool can iterate.
        pool_args (list): list of any type that is passed into the pool.map or map.
        disable_multiprocessing (bool): if set to True, multiprocessing will not be applied, regardless of config.ini entry.
        dataframe_mode (bool): set to True to use starmap, so pd.concat can be used on the results;
                               if set to False, the result will be a list of lists.

    Returns:
        res (list): Result of multiprocessing. The length of the result matches the length of pool_args.

    """
    from multiprocessing.pool import ThreadPool
    parallel, cores = get_multiprocessing_config()
    log.debug("Start with parallel={} and cores={}, queue size={}".format(
        parallel, cores, len(pool_args)))
    if parallel and not disable_multiprocessing:
        threadpool = ThreadPool(cores)
        if dataframe_mode:
            res = threadpool.starmap(pool_fn, pool_args)
        else:
            res = threadpool.map(pool_fn, pool_args)
    else:
        res = [pool_fn(x) for x in pool_args]
    assert len(res) == len(pool_args)
    log.debug("Completed.")
    return res
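A usage sketch for the wrapper above, with made-up worker functions to show the two modes (the workers and data are illustrative, not from the source):

from functools import partial

def score(row, factor=1.0):
    return row["value"] * factor

# default mode: pool_fn receives one item per call (map-style)
results = multi_threading(partial(score, factor=2.0),
                          [{"value": 1}, {"value": 2}, {"value": 3}])

def pair_sum(a, b):
    return a + b

# dataframe_mode=True: each item is unpacked as an argument tuple (starmap-style)
sums = multi_threading(pair_sum, [(1, 2), (3, 4)], dataframe_mode=True)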
Example #26
def cvx_transform_QPSDP_solve_batch(Qs, qs, Gs, hs, As, bs, D, eps, xi, delta, T, p=None,
                                    cp_sol = cp.SCS, n_jobs = 1, verbose=False):
    """

    :param Qs:
    :param qs:
    :param Gs:
    :param hs:
    :param As:
    :param bs:
    :param D:
    :param eps:
    :param xi:
    :param delta:
    :param cp_sol:
    :param n_jobs:
    :param verbose:
    :return:
    """

    # prob.value, xhat, GAMMA_hat, lam, lam_sdp, mu, slacks

    if n_jobs == -1:
        n_jobs = mp.cpu_count()
    batch_size = len(As)
    pool = ThreadPool(processes=n_jobs)
    args = []
    for i in range(batch_size):
        args += [(Qs[i], qs[i], Gs[i], hs[i], As[i], bs[i], D[i], eps[i], xi, delta, T, p, cp_sol, verbose)]

    return pool.starmap(forward_cvx_single_d_filter_wrapper, args)
Example #27
def frame_blobs_parallel(dert__):
    '''
    Draft of the parallel blob-forming process.
    '''

    pool = ThreadPool(mp.cpu_count())  # initialize pool of threads

    height, width = dert__[0].shape  # height and width of image

    # generate all x and y coordinates
    dert_coord = [[y, x] for x in range(width) for y in range(height)]
    y_, x_ = zip(*[[y, x] for y, x in dert_coord])

    # get each non class instance dert from coordinates
    dert_ = [dert__[:, y, x] for y, x in dert_coord]

    # (parallel process) generate instance of derts and blobs from their class
    blob_, dert_ = zip(*pool.starmap(generate_blobs, zip(dert_, y_, x_)))

    get_rim_dert(dert_, height, width)  # get rim per dert

    cycle_count = 0
    f_cycle = 1  # flag to continue cycle, 0 = stop, 1 = continue

    ## 1st cycle ##
    blob_, id_map__ = extension_cycle(pool, blob_, height, width)
    id_map_prior__ = id_map__  # prior id_map, to check when to stop iteration

    # save output image
    cv2.imwrite("./images/parallel/id_cycle_0.png",
                (((id_map__) * 255) / (width * height)).astype('uint8'))

    while f_cycle:

        print("Running cycle " + str(cycle_count + 1))
        ## consecutive cycles ##
        blob_, id_map__ = extension_cycle(pool, blob_, height, width)
        # check if ids changed:
        dif = id_map__ - id_map_prior__
        # update id map
        id_map_prior__ = id_map__
        # if no change in ids, stop the iteration:
        if (np.sum(dif) == 0):
            f_cycle = 0
        # save image
        cv2.imwrite(
            "./images/parallel/id_cycle_" + str(cycle_count + 1) + ".png",
            (((id_map__) * 255) / (width * height)).astype('uint8'))

        cycle_count += 1

    accumulate_blob_(blob_)  # accumulate dert param into their blob

    print("total cycle= " + str(cycle_count))

    # close pool of threads
    pool.close()
    pool.join()

    return blob_, id_map__
Example #28
    def pull(self,
             x: Union[str, List[str]],
             quiet: bool = False) -> Union[Image, List[Image]]:
        """Pull one or more docker image(s)

        Alias: `docker.pull(...)`

        # Arguments
            x: The image name(s) . Can be a string or a list of strings. In case of
                list, multithreading is used to pull the images.
                The progress bars might look strange as multiple
                processes are drawing on the terminal at the same time.
            quiet: If you don't want to see the progress bars.

        # Returns:
            The Docker image loaded (`python_on_whales.Image` object).
            If a list was passed as input, then a `List[python_on_whales.Image]` will
            be returned.
        """

        if x == []:
            return []
        elif isinstance(x, str):
            return self._pull_single_tag(x, quiet=quiet)
        elif isinstance(x, list) and len(x) == 1:
            return [self._pull_single_tag(x[0], quiet=quiet)]
        elif len(x) >= 2:
            pool = ThreadPool(4)
            generator = self._generate_args_push_pull(x, quiet)
            all_images = pool.starmap(self._pull_single_tag, generator)
            pool.close()
            pool.join()
            return all_images
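A usage sketch for the pull wrapper, mirroring the push example earlier (image names are placeholders; `docker` is again the top-level python-on-whales client):

from python_on_whales import docker

image = docker.pull("python:3.11-slim")            # single name: returns one Image
images = docker.pull(["redis:7", "nginx:1.25"])    # list: pulled via ThreadPool(4), returns a list of Images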
Example #29
def bulkBatch(
    symbols,
    fields=None,
    range_="1m",
    last=10,
    token="",
    version="",
    filter="",
    format="json",
):
    """Optimized batch to fetch as much as possible at once

    https://iexcloud.io/docs/api/#batch-requests


    Args:
        symbols (list): List of tickers to request
        fields (list): List of fields to request
        range_ (str): Date range for chart
        last (int):
        token (str): Access token
        version (str): API version
        filter (str): filters: https://iexcloud.io/docs/api/#filter-results
        format (str): return format, defaults to json

    Returns:
        dict: results in json
    """
    fields = fields or _BATCH_TYPES
    args = []
    empty_data = []
    list_orig = empty_data.__class__

    if not isinstance(symbols, list_orig):
        raise PyEXception("Symbols must be of type list")

    for i in range(0, len(symbols), 99):
        args.append(
            (symbols[i : i + 99], fields, range_, last, token, version, filter, format)
        )

    pool = ThreadPool(20)
    rets = pool.starmap(batch, args)
    pool.close()

    ret = {}

    for i, d in enumerate(rets):
        symbols_subset = args[i][0]
        if len(d) != len(symbols_subset):
            empty_data.extend(list_orig(set(symbols_subset) - set(d.keys())))
        ret.update(d)

    for k in empty_data:
        if k not in ret:
            if isinstance(fields, str):
                ret[k] = {}
            else:
                ret[k] = {x: {} for x in fields}
    return ret
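A call sketch for bulkBatch; the tickers and fields are illustrative and the token is a placeholder. The per-symbol, per-field layout is inferred from the fallback `ret[k] = {x: {} for x in fields}` above:

symbols = ["AAPL", "MSFT", "GOOG"]
data = bulkBatch(symbols, fields=["quote", "chart"], range_="1m", token="<YOUR_TOKEN>")
aapl_quote = data["AAPL"]["quote"]   # one dict per symbol, keyed by requested field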
Example #30
def bulkBatch(symbols, types=None, _range='1m', last=10):
    '''fetch a large number of fields for multiple symbols all at the same time'''
    types = types or _BATCH_TYPES
    args = []
    empty_data = []
    list_orig = empty_data.__class__

    if not isinstance(symbols, list_orig):
        raise PyEXception('Symbols must be of type list')

    for i in range(0, len(symbols), 99):
        args.append((symbols[i:i + 99], types, _range, last))

    pool = ThreadPool(20)
    rets = pool.starmap(batch, args)
    pool.close()

    ret = {}

    for i, d in enumerate(rets):
        symbols_subset = args[i][0]
        if len(d) != len(symbols_subset):
            empty_data.extend(list_orig(set(symbols_subset) - set(d.keys())))
        ret.update(d)

    for k in empty_data:
        if k not in ret:
            if isinstance(types, str):
                ret[k] = {}
            else:
                ret[k] = {x: {} for x in types}
    return ret
Example #31
    def extract_features_folder(self):
        """
        Method for extracting features for all images in a folder
        :return:
        """
        file_names = sorted(
            glob.glob1(self.folder, '*' + self.constants.JPG_EXTENSION))
        length = len(file_names)
        mongo_wrapper = mongo.MongoWrapper()
        # Dropping the collection before bulk inserting
        mongo_wrapper.drop_collection(self.model.lower())
        if self.model == self.constants.SIFT:
            mongo_wrapper.drop_collection(
                mongo_wrapper.constants.SIFT_FEATURE_COLLECTION.lower())

        for i in range(0, length, self.constants.BULK_PROCESS_COUNT):
            pool = ThreadPool(self.constants.NUM_THREADS)
            mongo_wrapper.bulk_insert(
                self.model.lower() if self.model != self.constants.SIFT else
                mongo_wrapper.constants.SIFT_FEATURE_COLLECTION,
                pool.starmap(
                    getattr(ExtractFeatures, 'extract_' + self.model.lower()),
                    [(self, i, True) for i in file_names[i:length]]
                    if i + self.constants.BULK_PROCESS_COUNT > length else [
                        (self, i, True)
                        for i in file_names[i:i +
                                            self.constants.BULK_PROCESS_COUNT]
                    ]))
        if self.model == self.constants.SIFT:
            print('Processing Data for {}'.format(self.model))
            self.create_bog_histogram(overwrite=True)
Example #32
 def init(self, export='csv', path='out'):
     # fetch the data
     # verify data integrity
     # record data that errored out / or
     # ip = socket.gethostbyname(self.SINA_API_HOSTNAME)
     # self.SINA_API = self.SINA_API % ip
     print(self.SINA_API)
     stock_codes = self.get_all_stock_codes()
     exists_codes = [
         code[:-4] for code in os.listdir(os.path.join(path, 'raw_data'))
         if code.endswith('.csv')
     ]
     stock_codes = set(stock_codes).difference(exists_codes)
     pool = ThreadPool(500)
     params = [(code, export, path) for code in stock_codes]
     pool.starmap(self.out_stock_history, params)
Example #33
 def init(self, export='csv', path='out'):
     path = os.path.join(path, 'day')
     self.result_path = os.path.join(path, 'data')
     self.raw_path = os.path.join(path, 'raw_data')
     if not os.path.exists(self.result_path):
         os.makedirs(self.result_path)
     if not os.path.exists(self.raw_path):
         os.makedirs(self.raw_path)
     stock_codes = self.get_all_stock_codes()
     if os.path.exists(os.path.join(path, 'raw_data')):
         exists_codes = [code[:-4] for code in os.listdir(os.path.join(path, 'raw_data')) if code.endswith('.csv')]
     else:
         exists_codes = set()
     stock_codes = set(stock_codes).difference(exists_codes)
     pool = ThreadPool(100)
     params = [(code, export, path) for code in stock_codes]
     pool.starmap(self.out_stock_history, params)
Example #34
from multiprocessing.pool import ThreadPool

import time


def get_param():
    paramList = []
    for i in range(5000):
        index = str(i)
        thread_num = 'thread-' + str(i)
        param = (index, thread_num)
        paramList.append(param)
    return paramList


def process_operator(index, thread_num):
    print(thread_num, "say: i am", index)


if __name__ == '__main__':
    s_time = time.time()
    pool = ThreadPool(10)
    paramList = get_param()
    print(paramList)
    pool.starmap(process_operator, paramList)
    e_time = time.time()
    total_time = e_time - s_time
    print(total_time)


Example #35
        response = requests.post('https://www.ihg.com/gs-json/cn/zh/login',json=auth_json)
        return json.loads(str(response.content, encoding='utf-8'))
    except Exception as e:
        print('发生{} 跳过账号 {}'.format(e.__traceback__, account))  # "Error {}, skipping account {}"
        return None


def spider_task(account_start,account_end,pwd):
    for account in range(account_start,account_end):
        response = _request(str(account),str(pwd))
        if response:
            _parseAccountDict(response)

if __name__ == '__main__':
    pool = ThreadPool(5)
    startAccount = int(input('输入 起始账号\n'))  # prompt: "Enter starting account number"
    endAccount = int(input('输入 结束账号\n'))    # prompt: "Enter ending account number"
    pwd = input('输入测试密码\n')                 # prompt: "Enter test password"
    # test code
    # startAccount = 324577564
    # endAccount = 324577570
    # pwd = '0822'
    accountSize = endAccount - startAccount
    partLen = accountSize//5
    args_map =[]
    actor.start()
    for i in range(5):
        args_map.append((startAccount+i*partLen,startAccount+(i+1)*partLen,pwd))
    pool.starmap(spider_task,args_map)
    print('测试中 ...')  # "testing ..."