Example No. 1
    def __init__(self, model=None, tokenizer=None,
                 embedding_file=None, num_workers=None):
        """
        Args:
            model: path to saved model file
            tokenizer: optional string to select tokenizer class
            embedding_file: if provided, will expand dictionary to use all
              available pretrained vectors in this file.
            num_workers: number of CPU processes to use to preprocess batches.
        """
        logger.info('Initializing model...')
        self.model = DocReader.load(model or DEFAULTS['model'])

        if embedding_file:
            logger.info('Expanding dictionary...')
            words = utils.index_embedding_words(embedding_file)
            added = self.model.expand_dictionary(words)
            self.model.load_embeddings(added, embedding_file)

        logger.info('Initializing tokenizer...')
        annotators = tokenizers.get_annotators_for_model(self.model)
        if not tokenizer:
            tokenizer_class = DEFAULTS['tokenizer']
        else:
            tokenizer_class = tokenizers.get_class(tokenizer)

        if num_workers is None or num_workers > 0:
            self.workers = ProcessPool(
                num_workers,
                initializer=init,
                initargs=(tokenizer_class, annotators),
            )
        else:
            self.workers = None
            self.tokenizer = tokenizer_class(annotators=annotators)
Example No. 2
def calculateSystemRanks_multiprocessing(systemList, systemAddress,
                                         relevanceJudgementAddress,
                                         rankMetric):
    num_workers = None
    workers = ProcessPool(processes=20)
    system_metric_value = {}  # key is the system_name
    #print systemList
    with tqdm(total=len(systemList)) as pbar:
        partial_calculateSystemRanks = partial(
            calculateSystemRanks,
            systemAddress=systemAddress,
            relevanceJudgementAddress=relevanceJudgementAddress,
            rankMetric=rankMetric)
        for system_info in tqdm(
                workers.imap_unordered(partial_calculateSystemRanks,
                                       systemList)):
            system_name = system_info[0]
            system_metric_val = system_info[1]
            system_metric_value[system_name] = system_metric_val
            pbar.update()
    workers.close()
    workers.join()
    system_metric_value_list = []
    for system_name in sorted(system_metric_value.keys()):
        system_metric_value_list.append(system_metric_value[system_name])
    return (system_metric_value, system_metric_value_list)
Example No. 3
 def __init__(self, PoolType):  # PoolType is a string, either 'Process' or 'Thread'
     name = 'MyProblem'  # Initialize name (problem name, can be set arbitrarily)
     M = 1  # Initialize M (number of objectives)
     maxormins = [1]  # Initialize maxormins (per-objective min/max flags; 1: minimize, -1: maximize)
     Dim = 4  # Initialize Dim (number of decision variables)
     varTypes = [1, 1, 0, 0]  # Initialize varTypes (variable types; 0: continuous, 1: discrete)
     lb = [1, 1, 0, 0]  # Lower bounds of the decision variables
     ub = [10000, 2000, 1, 1]  # Upper bounds of the decision variables
     lbin = [0] * Dim  # Lower-bound inclusiveness (0: exclusive, 1: inclusive)
     ubin = [1] * Dim  # Upper-bound inclusiveness (0: exclusive, 1: inclusive)
     # Call the parent constructor to complete instantiation
     ea.Problem.__init__(self, name, M, maxormins, Dim, varTypes, lb, ub,
                         lbin, ubin)
     # Data used when evaluating the objective function
     X_train, X_test, Y_train, Y_test = get_data(hour_num=0,
                                                 transform='sin+cos',
                                                 drop_time=True,
                                                 scale=True)
     # Replace zeros with 0.001
     X_train = np.where(X_train == 0, 0.001, X_train)
     Y_train = np.where(Y_train == 0, 0.001, Y_train)
     self.data = X_train  # Training features (normalized)
     self.dataTarget = Y_train
     # Choose between multithreading and multiprocessing
     self.PoolType = PoolType
     if self.PoolType == 'Thread':
         self.pool = ThreadPool(2)  # Set the pool size
     elif self.PoolType == 'Process':
         num_cores = int(mp.cpu_count())  # Number of CPU cores on this machine
         self.pool = ProcessPool(num_cores)  # Set the pool size
Example No. 4
def calculate_matches(all_docs: Dict[Text, Tuple[Text, Text]],
                      closest_docs: List[Tuple[List[Text], np.ndarray]],
                      answers: List[List[Text]], worker_num: int):
    global dpr_all_documents
    dpr_all_documents = all_docs

    tok_opts = {}
    tokenizer = SimpleTokenizer(**tok_opts)

    processes = ProcessPool(processes=worker_num)
    get_score_partial = partial(check_answer, tokenizer=tokenizer)

    closest_ids = [doc[0] for doc in closest_docs]
    answers_and_retrieved_docs = zip(answers, closest_ids)

    scores = processes.map(get_score_partial, answers_and_retrieved_docs)

    n_docs = len(closest_docs[0][0])
    top_k_hits = [0] * n_docs
    for question_hits in scores:
        best_hit = next((i for i, x in enumerate(question_hits) if x), None)
        if best_hit is not None:
            top_k_hits[best_hit:] = [v + 1 for v in top_k_hits[best_hit:]]

    return QAMatchStats(top_k_hits, scores)
Example No. 5
def store_contents(data_path, save_path, preprocess, num_workers=None):
    """Preprocess and store a corpus of documents in sqlite.

    Args:
        data_path: Root path to directory (or directory of directories) of files
          containing json encoded documents (must have `id` and `text` fields).
        save_path: Path to output sqlite db.
        preprocess: Path to file defining a custom `preprocess` function. Takes
          in and outputs a structured doc.
        num_workers: Number of parallel processes to use when reading docs.
    """
    if os.path.isfile(save_path):
        raise RuntimeError('%s already exists! Not overwriting.' % save_path)

    logger.info('Reading into database...')
    conn = sqlite3.connect(save_path)
    c = conn.cursor()
    c.execute("CREATE TABLE documents (id PRIMARY KEY, title, text);")

    workers = ProcessPool(num_workers, initializer=init, initargs=(preprocess,))
    files = [f for f in iter_files(data_path)]
    count = 0
    with tqdm(total=len(files)) as pbar:
        for pairs in tqdm(workers.imap_unordered(get_contents, files)):
            count += len(pairs)
            c.executemany("INSERT INTO documents VALUES (?,?,?)", pairs)
            pbar.update()
    logger.info('Read %d docs.' % count)
    logger.info('Committing...')
    conn.commit()
    conn.close()
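A minimal invocation sketch; the paths, the worker count, and passing preprocess=None for "no custom preprocessing" are illustrative assumptions, not part of the original source:

if __name__ == '__main__':
    # Build the sqlite corpus from a directory of JSON documents
    store_contents(
        data_path='data/docs',       # hypothetical input directory
        save_path='data/docs.db',    # hypothetical output sqlite file
        preprocess=None,             # assumed: no custom preprocess module
        num_workers=4,               # reader processes
    )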
Example No. 6
def ping_scan(network):
    # Multiprocessing
    pool = ProcessPool(processes=150)
    net = ipaddress.ip_network(network)

    result_obj_dict = {}

    for ip in net:
        # Collect the async result object
        result_obj = pool.apply_async(scapy_ping_one, args=(str(ip), ))
        result_obj_dict[str(ip)] = result_obj

    pool.close()
    pool.join()

    # print(result_obj_dict)

    active_ip = []

    for ip, obj in result_obj_dict.items():
        # print(obj.get())
        if obj.get()[1] == 1:
            active_ip.append(ip)

    # print(active_ip)
    return active_ip
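A possible call to ping_scan; the CIDR block below is illustrative, and an address counts as active when scapy_ping_one returns 1 in the second element of its result tuple:

if __name__ == '__main__':
    # Scan an example /24 network and print the hosts that answered
    alive = ping_scan('192.168.1.0/24')
    print(alive)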
Example No. 7
    def batchFeedDownload(self, feedPack: list, procs: int) -> list:
        """
        Downloads a collection of feeds in parallel processes
        :param feedPack: Feed data
        :param procs: Number of parallel processes to get data over different feeds
        """

        logger.info('Download started')

        if procs == 1:

            downloadStartTime = datetime.now()
            feedData: list = []

            # Iterate over feed links and download feeds
            for link in feedPack:
                feedData.append(self.getFeed(link))

            downloadEndTime = datetime.now()
            downloadTime = downloadEndTime - downloadStartTime

            return feedData

        elif procs > 1:

            # Log download start time
            downloadStartTime = datetime.now()
            feedData: list = []

            # Define process pool and start download feeds from feedPack in parallel processes
            pool = ProcessPool(procs)

            # Download feeds in a number of separate processes
            feedData = pool.map(self.getFeed, feedPack)

            # Log download end time
            downloadEndTime = datetime.now()
            downloadTime = downloadEndTime - downloadStartTime

            # Calculate total feeds size
            totalFeedsSize: int = 0

            for dictItem in feedData:
                totalFeedsSize += dictItem['feedSize']

            # Log results
            logger.info(
                'Successfully downloaded {0} feeds of {1} Kbytes in {2} seconds {3} msec'
                .format(
                    len(feedPack),
                    round(totalFeedsSize, 1),
                    downloadTime.seconds,
                    downloadTime.microseconds,
                ), )

            pool.close()
            pool.join()

            return feedData
Example No. 8
def _crawl_user(user_id):
    """ 查询user的画板 """
    if not user_id:
        return
    user_url = BASE_URL + "/{}".format(user_id)
    limit = 5
    try:
        # get first board data
        r = request.get(user_url).json()
    except requests.ConnectionError:
        request.headers.update({"User-Agent": choice(user_agent_list)})
        r = request.get(user_url).json()
    except Exception as e:
        printcolor("Crawl first page error, user_id: {}".format(user_id),
                   "yellow")
        logging.error(e, exc_info=True)
    else:
        if "user" in r:
            user_data = r["user"]
        else:
            printcolor(r.get("msg"))
            return
        board_number = int(user_data['board_count'])
        retry = 2 * board_number / limit
        board_ids = user_data['boards']
        printcolor(
            "Current user <{}> boards number is {}, first boards number is {}".
            format(user_id, board_number, len(board_ids)), 'red')
        if len(board_ids) < board_number:
            last_board = user_data['boards'][-1]['board_id']
            while 1 <= retry:
                # get ajax pin data
                user_next_url = BASE_URL + \
                    "/{}?jhhft3as&max={}&limit={}&wfl=1".format(
                        user_id, last_board, limit)
                try:
                    user_next_data = request.get(user_next_url).json()["user"]
                except Exception as e:
                    logging.error(e, exc_info=True)
                    continue
                else:
                    board_ids += user_next_data["boards"]
                    printcolor(
                        "ajax load user with board_id {}, get boards number is {}, merged"
                        .format(last_board,
                                len(user_next_data["boards"])), "blue")
                    if len(user_next_data["boards"]) == 0:
                        break
                    last_board = user_next_data["boards"][-1]["board_id"]
                retry -= 1
                # Throttle the request rate
                sleep(SLEEP_TIME)
        board_ids = map(str, [board['board_id'] for board in board_ids])
        pool = ProcessPool()  # Create a process pool
        # board_ids: the items to process; _crawl_board: the function applied to each item
        pool.map(_crawl_board, board_ids)
        pool.close()  # Close the pool; no new tasks are accepted
        pool.join()  # Block the main process until all workers exit
        printcolor("Current user {}, download over".format(user_id), "green")
Example No. 9
def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.
    """
    # Map doc_ids to indexes
    global DOC2IDX
    db_class = retriever.get_class(db)
    # Document ids are read from Elasticsearch via a scroll query
    # instead of doc_db.get_doc_ids()
    doc_ids = []
    res = es.search(index="htts", doc_type="htts",
                    body={"size": 500, "query": {"match_all": {}}},
                    scroll='10m')
    scroll_id = res['_scroll_id']
    for ref in scrollr(es, scroll_id, extract_references):
        print(ref)
        doc_ids.append(ref)

    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(
        args.num_workers,
        initializer=init,
        initargs=(tok_class, db_class, db_opts)
    )

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)
    batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)]
    _count = partial(count, args.ngram, args.hash_size)
    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) + '-' * 25)
        for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix...')
    count_matrix = sp.csr_matrix(
        (data, (row, col)), shape=(args.hash_size, len(doc_ids))
    )
    count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids)
Example No. 10
 def _open(self) -> None:
     """Open pools"""
     max_processes = cpu_count(logical=False)
     # Reserve one CPU for I/O bound tasks
     if max_processes > 2:
         max_processes = max_processes - 1
     self.__process_pool = ProcessPool(max_processes)
     self.__thread_pool = ThreadPool(max_processes)
Example No. 11
def process_pool(num):
    p = ProcessPool(num)
    start_time = time.time()
    ret = p.map(run, range(max_range))
    p.close()
    p.join()
    print("process_pool %d, costTime: %fs ret.size: %d" %
          (num, (time.time() - start_time), len(ret)))
Example No. 12
def get_comic_index():
    urlpool = [url+str(i)+'/' for url in URLs for i in range(1,470)]
    print("Crawl index: ", ctime())
    pool = ProcessPool(PROCESSNUMS)
    pool.map(_get_comic_index, urlpool)
    pool.close()
    pool.join()
    print('Finished:', ctime())
Example No. 13
def run_processpool(some_function, list_of_args, threads):
    pool = ProcessPool(threads)
    #pass the function that we will be multiprocessing along with args per process
    results = pool.map(some_function, list_of_args)
    pool.close()
    #complete the process with join
    pool.join()
    return results
Example No. 14
def iterate_list_with_parallel(f, data_length, thread):
    p = ProcessPool(thread)  # pool with 'thread' worker processes
    index_list = [(int(idx * data_length / thread),
                   int((idx + 1) * data_length / thread))
                  for idx in range(thread)]
    p.map(f, index_list)
    p.close()
    p.join()
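A sketch of how the (start, end) index tuples might be consumed; process_slice and items are hypothetical names, and the worker must be defined at module level so it can be pickled:

items = list(range(1000))

def process_slice(bounds):
    # Each worker receives a (start, end) tuple covering its share of the data
    start, end = bounds
    for item in items[start:end]:
        pass  # per-item work goes here

if __name__ == '__main__':
    # Partition the 1000 items across 4 worker processes
    iterate_list_with_parallel(process_slice, len(items), 4)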
Example No. 15
    def __init__(self, M, file_dir_sce, file_dir_data, file_dir_eval,
                 configure):

        name = 'AdaptObj'  # Initialize name (problem name, can be set arbitrarily)
        maxormins = [1] * M  # Initialize maxormins (per-objective min/max flags; 1: minimize, -1: maximize)

        # Dim = 19  # Initialize Dim (number of decision variables)
        # varTypes = [  0,  0,   0,   0,  0, 0,   0,   0,  0, 0,  0,   0,  0, 0,   0,   0,  1, 1, 1] # Initialize varTypes (variable types; 0: real, 1: integer)
        # lb =       [150,  8,  20, 212,  4, 0,  20, 205,  4, 0,  1, 150,  4, 0,   0,  50, 10, 1, 4]  # Lower bounds of the decision variables
        # ub =       [180, 16, 100, 217, 16, 3, 120, 210, 16, 3,  6, 250, 16, 3, 240, 250, 20, 3, 6] # Upper bounds of the decision variables
        # lbin = [1] * Dim  # Lower-bound inclusiveness (0: exclusive, 1: inclusive)
        # ubin = [1] * Dim  # Upper-bound inclusiveness (0: exclusive, 1: inclusive)

        self.config = configure

        Dim = 16  # Initialize Dim (number of decision variables)
        # varTypes = [  0,  0,   0,   0,  0, 0,   0,   0,  0, 0,  0,   0,  0, 0,   0,   0,  1, 1, 1] # Initialize varTypes (variable types; 0: real, 1: integer)
        lb = [
            self.config.ego_s0[0], self.config.ego_v0[0],
            self.config.start_s[0], self.config.end_s[0],
            self.config.green_time[0], self.config.yellow_time[0],
            self.config.red_time[0], self.config.pos_s[0],
            self.config.pos_s[0], self.config.pos_y_1[0],
            self.config.velo_1[0], self.config.acc_1[0],
            self.config.pos_y_1[0], self.config.velo_1[0],
            self.config.acc_1[0], self.config.start_time_2[0]
        ]  # Lower bounds of the decision variables
        ub = [
            self.config.ego_s0[1], self.config.ego_v0[1],
            self.config.start_s[1], self.config.end_s[1],
            self.config.green_time[1], self.config.yellow_time[1],
            self.config.red_time[1], self.config.pos_s[1],
            self.config.pos_s[1], self.config.pos_y_1[1],
            self.config.velo_1[1], self.config.acc_1[1],
            self.config.pos_y_1[1], self.config.velo_1[1],
            self.config.acc_1[1], self.config.start_time_2[1]
        ]  # Upper bounds of the decision variables
        Dim = len(lb)
        # print(Dim)
        varTypes = [0] * Dim
        lbin = [1] * Dim  # Lower-bound inclusiveness (0: exclusive, 1: inclusive)
        ubin = [1] * Dim  # Upper-bound inclusiveness (0: exclusive, 1: inclusive)

        self.file_dir_sce = file_dir_sce
        self.file_dir_data = file_dir_data
        self.file_dir_eval = file_dir_eval
        # Call the parent constructor to complete instantiation
        ea.Problem.__init__(self, name, M, maxormins, Dim, varTypes, lb, ub,
                            lbin, ubin)

        # Choose between multithreading and multiprocessing
        self.PoolType = configure.PoolType

        if self.PoolType == 'Thread':
            self.pool = ThreadPool(4)  # Set the pool size
        elif self.PoolType == 'Process':
            num_cores = int(mp.cpu_count())  # Number of CPU cores on this machine
            self.pool = ProcessPool(num_cores)  # Set the pool size
Example No. 16
def process_pool(num):
    p = ProcessPool(num)
    start_time = time.time()
    data = zip(['a', 'b', 'c'], [1, 3, 6])
    ret = p.map(run, data)
    p.close()
    p.join()
    print("process_pool %d, costTime: %fs ret.size: %d" %
          (num, (time.time() - start_time), len(ret)))
Example No. 17
def getDataAsync3(baseurl: str, process_num=os.cpu_count()) -> List[List]:
    """多进程 异步方式爬取数据"""
    datalist = []
    urls = [baseurl + str(i * 25) for i in range(10)]
    with ProcessPool(process_num) as pool:
        htmls = pool.map(askUrl, urls)
    for html in htmls:
        datalist.extend(parseData(html))
    return datalist
Example No. 18
def main():
    data_files = find_paths(args.input_dir)
    logger.info(f'{len(data_files)} data files found.')
    if len(data_files) == 0:
        return
    workers = ProcessPool(args.num_workers)
    with tqdm(total=len(data_files)) as pbar:
        for _ in tqdm(workers.imap_unordered(process_tweets_file, data_files)):
            pbar.update()
Example No. 19
 def sPool(callback, tnum=20, cnum='', arg=[]):
     from multiprocessing import Pool as ProcessPool  # process pool
     from multiprocessing import cpu_count  # number of CPUs
     if cnum == '':
         cnum = cpu_count()  # default the process count to the CPU count
     spool = ProcessPool(cnum)
     arr = list(
         map(
             lambda i: {
                 'cnum': i,
                 'tnum': tnum,
                 'callback': callback,
                 'arg': arg
             }, range(cnum)))
     spool.map(fleader.bPool, arr)
     spool.close()
     spool.join()
Example No. 20
def stream_corpus(data_path, dictionary, files, num_workers=None):
    workers = ProcessPool(num_workers)
    #files = [f for f in iter_files(data_path)]

    with tqdm(total=len(files)) as pbar:
        for pairs in tqdm(workers.imap_unordered(get_contents, files)):
            # pairs[0][0] --> docId, pairs[0][1] --> documentContent
            yield dictionary.doc2bow(pairs[0][1].split())
            pbar.update()
Example No. 21
 def _start_process(self, more_sample_content):
     cpu = os.cpu_count()
     pool = ProcessPool(cpu)
     index_batch, sample_batch = corpus_cut(more_sample_content, cpu)
     result = [pool.apply_async(self._process_task, args=(index_batch[i], sample_batch[i])) for i in range(cpu)]
     pool.close()
     pool.join()
     result = map(lambda item: item[1], sorted([item for res in result for item in res.get()],
                                               key=lambda item: item[0]))
     return list(result)
Example No. 22
def active_learning(topic_list, al_protocol, al_classifier, document_collection,
                    topic_seed_info, topic_complete_qrels_address, train_per_centage,
                    data_path, file_name, use_pooled_budget,
                    per_topic_budget_from_trec_qrels):
    num_workers = None
    workers = ProcessPool(processes=1)
    with tqdm(total=len(topic_list)) as pbar:
        partial_active_learning_multi_processing = partial(
            active_learning_multi_processing, al_protocol=al_protocol,
            al_classifier=al_classifier, document_collection=document_collection,
            topic_seed_info=topic_seed_info,
            topic_complete_qrels_address=topic_complete_qrels_address,
            train_per_centage=train_per_centage, use_pooled_budget=use_pooled_budget,
            per_topic_budget_from_trec_qrels=per_topic_budget_from_trec_qrels)
        for topic_all_info in tqdm(workers.imap_unordered(
                partial_active_learning_multi_processing, topic_list)):
            topicId = topic_all_info[0][0]  # 0 is the loopCounter index and 0 is the first tuple
            file_complete_path = data_path + file_name + str(topicId) + ".pickle"
            pickle.dump(topic_all_info, open(file_complete_path, 'wb'))
            pbar.update()
Example No. 23
def get_map_fn(num_procs, use_threads=False):
    if num_procs == 1:
        return map
    if use_threads:
        if num_procs not in MT_POOLS:
            MT_POOLS[num_procs] = ThreadPool(num_procs)
        return MT_POOLS[num_procs].map
    if num_procs not in MP_POOLS:
        MP_POOLS[num_procs] = ProcessPool(num_procs)
    return MP_POOLS[num_procs].map
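A usage sketch, assuming MP_POOLS and MT_POOLS are module-level dicts and that the mapped function is picklable (square is a hypothetical example):

def square(x):
    return x * x

if __name__ == '__main__':
    parallel_map = get_map_fn(4)   # process-backed map with 4 workers
    print(list(parallel_map(square, range(10))))
    serial_map = get_map_fn(1)     # num_procs == 1 falls back to the built-in map
    print(list(serial_map(square, range(10))))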
Example No. 24
def calc_factorials(max_int=100, pool_size=8, threads=True, chunk_size=10):

    if threads:
        pool = ThreadPool(pool_size)
    else:
        pool = ProcessPool(pool_size)

    results = pool.imap_unordered(factorial_calc, range(max_int), chunk_size)

    return results
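A usage sketch, assuming factorial_calc(n) returns a printable result for each n; imap_unordered yields results as they complete, so the order is not guaranteed:

if __name__ == '__main__':
    # Thread-backed pool by default; pass threads=False for a process pool
    for result in calc_factorials(max_int=10, pool_size=4, chunk_size=2):
        print(result)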
Example No. 25
def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.
    """
    # Map doc_ids to indexes
    global DOC2IDX
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()[:args.num_docs]
    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(args.num_workers,
                          initializer=init,
                          initargs=(tok_class, db_class, db_opts))

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    unigrams, bigrams = [], []
    hash2gram = {}
    step = max(int(len(doc_ids) / 10), 1)
    batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)]
    _count = partial(count, args.ngram, args.hash_size)
    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) +
                    '-' * 25)
        for b_row, b_col, b_data, b_unigrams, b_bigrams, b_hash2gram in workers.imap_unordered(
                _count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
            unigrams.extend(b_unigrams)
            bigrams.extend(b_bigrams)
            hash2gram.update(b_hash2gram)
    workers.close()
    workers.join()

    unigrams = list(set(unigrams))
    bigrams = list(set(bigrams))

    logger.info('Creating sparse matrix...')

    count_matrix = None
    if args.matrix_type == 'csr':
        count_matrix = sp.csr_matrix((data, (row, col)),
                                     shape=(args.hash_size, len(doc_ids)))
        count_matrix.sum_duplicates()
    elif args.matrix_type == 'csc':
        count_matrix = sp.csc_matrix((data, (row, col)),
                                     shape=(args.hash_size, len(doc_ids)))
        count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids), (unigrams, bigrams, hash2gram)
Example No. 26
def get_source_proxies():
    get_config = GetConfig()
    # Names of all the free-proxy fetcher methods
    proxy_func_names = get_config.get_proxy_func()
    # Fetch free proxies using multiple processes
    pool = ProcessPool(4)
    result = pool.map(run_cls_func, proxy_func_names)
    pool.close()
    pool.join()
    # Return [[proxy, proxy, ...], [proxy, proxy, ...]]
    return result
Example No. 27
def _map_multiprocess(func, iterable, chunksize=1):
    # type: (Callable[[S], T], Iterable[S], int) -> Iterator[T]
    """Chop iterable into chunks and submit them to a process pool.

    For very long iterables using a large value for chunksize can make
    the job complete much faster than using the default value of 1.

    Return an unordered iterator of the results.
    """
    with closing(ProcessPool()) as pool:
        return pool.imap_unordered(func, iterable, chunksize)
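A small usage sketch; the worker function must be defined at module level so it can be pickled, and results arrive unordered (_double is a hypothetical example):

def _double(n):
    return n * 2

if __name__ == '__main__':
    for value in _map_multiprocess(_double, range(8), chunksize=2):
        print(value)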
Example No. 28
def mk_process(proxies, url, jobs, processnum, threadnum, method, post):
    Processpools = ProcessPool(processnum)
    for i in range(processnum):
        Processpools.apply_async(mk_threading,
                                 args=(proxies[i], url, jobs, threadnum,
                                       method, post))
    print('CC start...')
    print('Waiting for all subprocesses done...')
    Processpools.close()
    Processpools.join()
    print('All subprocesses done.')
Example No. 29
def main(opts):
    files = sorted(glob.iglob(opts.pattern))
    memc_conf = {'idfa': opts.idfa,
                 'gaid': opts.gaid,
                 'adid': opts.adid,
                 'dvid': opts.dvid}
    pool = ProcessPool(processes=FILES_PROCESSING_POOL_SIZE)
    load_file_fn = partial(load_file, threads_count=LOAD_FILE_THREADS_COUNT, memc_conf=memc_conf, dry_run=opts.dry)

    for file_name in pool.imap(load_file_fn, files):
        dot_rename(file_name)
Example No. 30
def refresh_parallelized(list_cubes, size_pool=5, multiprocessing=True):
    if list_cubes:
        if multiprocessing:
            pool = ProcessPool(size_pool)
        else:
            pool = ThreadPool(size_pool)
        cubes_list = list()
        segments_list = list()
        for cubes, segments in list_cubes:
            cubes_list.append(cubes)
            segments_list.append(segments)
        pool.map(refresh_star, zip(cubes_list, segments_list))