from multiprocessing import JoinableQueue, Lock, Process, Semaphore, cpu_count


class Thread_Pool_Manager(object):
    def __init__(self, thread_num=cpu_count()):
        self.thread_num = thread_num
        print(thread_num)
        self.work_queue = JoinableQueue()
        self.work_num = Semaphore(0)
        self.mutex = Lock()

    def start_threads(self):
        for i in range(self.thread_num):
            thread = Process(target=self.do_job)
            thread.daemon = True  # set worker process as daemon
            thread.start()

    def do_job(self):
        while True:
            self.work_num.acquire()
            with self.mutex:
                print(1, self.work_queue.qsize())
                thread_job = self.work_queue.get()
                print(0, self.work_queue.qsize())
            thread_job.do_job(self.work_queue, self.work_num)
            print(self.work_queue.qsize())
            self.work_queue.task_done()

    def join(self):
        self.work_queue.join()

    def add_job(self, job):
        self.work_queue.put(job)
        self.work_num.release()
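# Hypothetical usage sketch for Thread_Pool_Manager above. The job class and its
# do_job(queue, semaphore) signature are assumptions inferred from the worker loop,
# which calls thread_job.do_job(self.work_queue, self.work_num); nothing below comes
# from the original codebase.
class SquareJob(object):
    def __init__(self, value):
        self.value = value

    def do_job(self, work_queue, work_num):
        print(self.value * self.value)


if __name__ == "__main__":
    manager = Thread_Pool_Manager(thread_num=4)
    manager.start_threads()
    for v in range(10):
        manager.add_job(SquareJob(v))
    manager.join()  # blocks until every queued job has been task_done()'d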
def next_train_parallel(self, img_len, n_workers):
    # deprecated
    # create multiprocessing overhead
    from multiprocessing import JoinableQueue, Process

    q_send, q_recive = JoinableQueue(), JoinableQueue()
    # send some jobs to the queue
    workers = [
        Process(target=get_batch_parallel,
                args=(q_send, q_recive, self.n_channels, self.img_w, self.img_h,
                      img_len, self.train_imgs, self.train_real_text,
                      self.test_imgs, self.test_real_text, self.is_nrc,
                      self.absolute_max_string_len,))
        for n in range(n_workers)
    ]
    # start all processes
    [i.start() for i in workers]

    while True:
        # check whether we need to send data
        if q_recive.empty() or q_send.qsize() < n_workers:
            for _ in range(n_workers - q_send.qsize()):
                q_send.put([self.cur_test_index, self.minibatch_size, False])
                if self.cur_test_index + self.minibatch_size > len(self.test_imgs):
                    self.cur_test_index = 0
                    # reshuffle is done in place
                    reshuffle_index = np.arange(len(self.test_imgs))
                    np.random.shuffle(reshuffle_index)
                    self.test_imgs = self.test_imgs[reshuffle_index].ravel()
                    self.test_real_text = self.test_real_text[reshuffle_index].ravel()
        ret = q_recive.get()
        yield ret
def load_urls(self, **kwargs):
    urls = ["https://www.atagar.com/echo.php"] * 100
    urls_queue = Queue()
    for url in urls:
        filename = os.path.join(self.results_directory,
                                "doc_%03d.txt" % (urls_queue.qsize() + 1))
        urls_queue.put((url, filename))
    self.urls_queue = urls_queue
    self.num_urls = int(urls_queue.qsize())
import sys
from math import ceil
from multiprocessing import JoinableQueue, Process, Queue


def queueManager(numProc, myList, function, *args):
    '''queueManager(numProc, myList, function, *args):
    generic function used to start worker processes via the multiprocessing Queue object
    numProc - number of processors to use
    myList - a list of objects to be iterated over
    function - target function
    *args - additional arguments to pass to function

    Return - an unordered list of the results from myList
    '''
    qIn = Queue()
    qOut = JoinableQueue()
    if args:
        arguments = (qIn, qOut,) + args
    else:
        arguments = (qIn, qOut,)
    results = []

    # reduce processor count if proc count > files
    i = 0
    for l in myList:
        qIn.put((i, l))
        i += 1

    for _ in range(numProc):
        p = Process(target=function, args=arguments)
        p.start()

    sys.stdout.write("Progress: {:>3}%".format(0))
    curProgress = 0
    lastProgress = 0
    while qOut.qsize() < len(myList):
        # sys.stdout.write("\b\b\b\b{:>3}%".format(int(ceil(100 * qOut.qsize() / len(myList)))))
        curProgress = int(ceil(100 * qOut.qsize() / len(myList)))
        if curProgress - lastProgress > 10:
            lastProgress += 10
            sys.stdout.write("\nProgress: {:>3}%".format(lastProgress))
            sys.stdout.flush()
    sys.stdout.write("\nProgress: {:>3}%".format(100))
    # sys.stdout.write("\b\b\b\b{:>3}%".format(100))
    sys.stdout.write("\n")

    for _ in range(len(myList)):
        # indicate done results processing
        results.append(qOut.get())
        qOut.task_done()

    # tell child processes to stop
    for _ in range(numProc):
        qIn.put('STOP')

    orderedRes = [None] * len(results)
    for i, res in results:
        orderedRes[i] = res

    qOut.join()
    qIn.close()
    qOut.close()
    return orderedRes
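# Hypothetical worker compatible with queueManager above; it is not part of the original
# source. It reads (index, item) tuples from qIn until it sees the 'STOP' sentinel and puts
# (index, result) tuples on qOut, which is the shape the re-ordering loop
# `for i, res in results` expects.
def squareWorker(qIn, qOut):
    for i, item in iter(qIn.get, 'STOP'):
        qOut.put((i, item * item))


# ordered = queueManager(4, [1, 2, 3, 4], squareWorker)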
def curate(db):
    art_queue = JoinableQueue()
    feed_queue = JoinableQueue()

    for feedurl in abo_urls(db["subscriptions"]):
        feed_queue.put(feedurl)

    print "Downloading %i feeds.." % feed_queue.qsize()
    run_processes(__feed_worker, feed_queue, art_queue)

    print "Downloading %i articles.." % art_queue.qsize()
    run_processes(__art_worker, art_queue)

    print "Done."
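# run_processes and the worker callables are not included in the snippet above; a hypothetical
# sketch of run_processes follows, inferred from its two call sites: it spawns a handful of
# processes that each run target(*queues) and waits for the first queue to be fully processed
# before returning. The process count default is an assumption.
from multiprocessing import Process


def run_processes(target, *queues, **kwargs):
    n_procs = kwargs.get("n_procs", 4)  # assumed default, not from the original
    procs = [Process(target=target, args=queues) for _ in range(n_procs)]
    for p in procs:
        p.daemon = True
        p.start()
    queues[0].join()  # JoinableQueue: wait until every item has been task_done()'d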
def test_basic():
    in_queue = JoinableQueue()
    algolia_reader = Algoliaio("MyAppID", "MyKey", 1000)
    algolia_reader.scan_and_queue(in_queue, p_index="INT_Rubriques", p_query=None,
                                  p_connect_timeout=30, p_read_timeout=60)
    assert in_queue.qsize() > 2600
def test_basic():
    in_queue = JoinableQueue()
    mongo_reader = Mongoio(p_host='localhost', p_port='27017',
                           p_user='******', p_password='******',
                           p_base='ACTIVITE',
                           p_rs_xtra_nodes=['localhost:27018', 'localhost:27019'],
                           p_rs_name='rs0')
    mongo_reader.scan_and_queue(in_queue, p_collection='rubriques', p_query={})
    assert in_queue.qsize() > 2600
class TaskManager:
    # noinspection PyPep8Naming
    def __init__(self, jobs_queue_capacity: int, workers_num: int,
                 WorkerClass: Worker.__class__ = Worker):
        # empty job queue
        self._queue = JoinableQueue(maxsize=jobs_queue_capacity)
        logger.info(
            f'Queue size set to accept at most {jobs_queue_capacity} jobs before pausing job assignment.'
        )
        self.WorkerClass = WorkerClass
        self.workers_num = max_number_of_workers(workers_num)
        self._workers = []

    def wake_up_workers(self):
        self._workers: List[Worker] = [
            self.WorkerClass(self._queue) for _ in range(self.workers_num)
        ]
        for worker in self._workers:
            worker.start()

    def assign_task(self, job: Task):
        self._queue.put(job)

    def stop_workers(self):
        logger.info('waiting for all workers to finish')
        # The usual termination condition is to put None on the queue. Queues are FIFO, but from the
        # Python 3.8 docs (https://docs.python.org/3.8/library/multiprocessing.html#pipes-and-queues):
        # "If multiple processes are enqueuing objects, it is possible for the objects to be received
        # at the other end out-of-order. However, objects enqueued by the same process will always be
        # in the expected order with respect to each other." So with a single producer this is not an
        # issue; with many producers it may happen that, even if the Nones are enqueued at the end of
        # the queue, consumers pick them up before other items (breaking the FIFO assumption), and the
        # workers would leave before the queue is empty. To avoid this, before sending the Nones it is
        # better to wait for the queue to be consumed.
        while not self._queue.empty():
            # not bullet-proof, as empty() and qsize() return approximate values, but it helps
            print(f"jobs waiting to be assigned: {self._queue.qsize()}")
            sleep(1)
        for _ in self._workers:
            self._queue.put(None, block=True, timeout=None)
        self._queue.join()
        logger.info('all processes finished')

    def discard_waiting_tasks(self):
        while not self._queue.empty():
            try:
                self._queue.get(False)
            except Empty:
                continue
            self._queue.task_done()

    def number_of_waiting_tasks(self):
        return self._queue.qsize()
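# Hypothetical sketch of a worker compatible with TaskManager above; the real Worker and Task
# classes are not shown in this snippet, so this stand-in only illustrates the protocol the
# manager relies on: consume jobs until the None sentinel arrives, and call task_done() for
# every get() (sentinel included) so that stop_workers()'s _queue.join() can return.
from multiprocessing import JoinableQueue, Process


class SketchWorker(Process):
    def __init__(self, queue: JoinableQueue):
        super().__init__(daemon=True)
        self._queue = queue

    def run(self):
        while True:
            job = self._queue.get()
            if job is None:
                self._queue.task_done()
                break
            try:
                job.execute()  # assumed Task interface, purely illustrative
            finally:
                self._queue.task_done()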
def apply_mt(self, xs, parallelism, **kwargs):
    """Run the UDF multi-threaded using python multiprocessing"""
    if snorkel_conn_string.startswith('sqlite'):
        raise ValueError('Multiprocessing with SQLite is not supported. Please use a different database backend,'
                         ' such as PostgreSQL.')

    # Fill a JoinableQueue with input objects
    in_queue = JoinableQueue()
    for x in xs:
        in_queue.put(x)

    # If the UDF has a reduce step, we collect the output of apply in a
    # Queue. This is also used to track progress via the UDF sentinel
    out_queue = JoinableQueue()

    # Keep track of progress counts
    total_count = in_queue.qsize()
    count = 0

    # Start UDF Processes
    for i in range(parallelism):
        udf = self.udf_class(in_queue=in_queue,
                             out_queue=out_queue,
                             add_to_session=(self.reducer is None),
                             **self.udf_init_kwargs)
        udf.apply_kwargs = kwargs
        self.udfs.append(udf)

    # Start the UDF processes, and then join on their completion
    for udf in self.udfs:
        udf.start()

    while any([udf.is_alive() for udf in self.udfs]) and count < total_count:
        y = out_queue.get()

        # Update progress whenever an item was processed
        if y == UDF.TASK_DONE_SENTINEL:
            count += 1
            if self.pb is not None:
                self.pb.update(1)
        # If there is a reduce step, do it now on this thread
        elif self.reducer is not None:
            self.reducer.reduce(y, **kwargs)
            out_queue.task_done()
        else:
            raise ValueError("Got non-sentinel output without reducer.")

    if self.reducer is None:
        for udf in self.udfs:
            udf.join()
    else:
        self.reducer.session.commit()
        self.reducer.session.close()

    # Flush the processes
    self.udfs = []
def start_multiprocessing(jsonfilename):
    work = JoinableQueue()
    STOP_TOKEN = "STOP!"

    filenames = []
    # start workers
    procs = []
    for i in range(int(cpu_count() - 8)):
        # slice off ".gz" and add a unique identifier
        outfile = "%s_%d.txt" % (jsonfilename[5:-3], i)
        filenames.append(outfile)
        # reset output file
        open(outfile, "w").close()
        t = Process(target=do_work, args=(work, outfile, STOP_TOKEN))
        t.daemon = True
        t.start()
        procs.append(t)

    # produce data
    with gzip.GzipFile(jsonfilename, "r") as fin:
        for line in fin:
            work.put(line)
            if work.qsize() > 300000:
                # try to avoid too much memory usage
                print("sleeping for 20 seconds to relax memory usage...")
                time.sleep(20)
                print("back to work! current queue size: %d" % (work.qsize()))

    work.put(STOP_TOKEN)

    print("Waiting to join processes...")
    for p in procs:
        p.join()

    print("collect output of multiple jobs into a single file??")
    with open("full_%s.out" % (jsonfilename[5:-3]), 'w') as outfile:
        for fname in filenames:
            with open(fname) as infile:
                for line in infile:
                    outfile.write(line)
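# do_work is not included in the snippet above; the following is a hypothetical sketch of a
# compatible consumer. It drains lines until it sees the stop token and, since only one token
# is enqueued for several workers, re-queues the token so the remaining workers can also exit
# (that hand-off detail is an assumption, not taken from the original code).
def do_work(work, outfile, stop_token):
    with open(outfile, "a") as out:
        while True:
            line = work.get()
            if line == stop_token:
                work.put(stop_token)  # pass the sentinel on to the next worker
                work.task_done()
                break
            if isinstance(line, bytes):  # gzip yields bytes on Python 3
                line = line.decode("utf-8")
            out.write(line)
            work.task_done()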
def apply_mt(self, xs, parallelism, **kwargs):
    """Run the UDF multi-threaded using python multiprocessing"""
    if not _meta.postgres:
        raise ValueError(
            "Fonduer must use PostgreSQL as a database backend.")

    # Fill a JoinableQueue with input objects
    in_queue = JoinableQueue()
    for x in xs:
        in_queue.put(x)

    # Use an output queue to track multiprocess progress
    out_queue = JoinableQueue()

    # Track progress counts
    total_count = in_queue.qsize()
    count = 0

    # Start UDF Processes
    for i in range(parallelism):
        udf = self.udf_class(in_queue=in_queue,
                             out_queue=out_queue,
                             worker_id=i,
                             **self.udf_init_kwargs)
        udf.apply_kwargs = kwargs
        self.udfs.append(udf)

    # Start the UDF processes, and then join on their completion
    for udf in self.udfs:
        udf.start()

    while any([udf.is_alive() for udf in self.udfs]) and count < total_count:
        y = out_queue.get()

        # Update progress bar whenever an item is processed
        if y == UDF.TASK_DONE:
            count += 1
            if self.pb is not None:
                self.pb.update(1)
        else:
            raise ValueError("Got non-sentinel output.")

    for udf in self.udfs:
        udf.join()

    # Terminate and flush the processes
    for udf in self.udfs:
        udf.terminate()
    self.udfs = []
def test_basic():
    in_queue = JoinableQueue()
    mysql_reader = Mysqlio('localhost', '3600', 'test', 'root', '')
    mysql_reader.scan_and_queue(in_queue, "SELECT * FROM swallow")
    assert in_queue.qsize() == 3

    res = []
    while not in_queue.empty():
        res.append(in_queue.get())

    expected_res = [{'id': 1, 'libelle': 'test'},
                    {'id': 2, 'libelle': 'john'},
                    {'id': 3, 'libelle': 'woo'}]
    assert res == expected_res
class TaskControl:
    def __init__(self, cls_worker, count, *args, **kwargs):
        self.queue = JoinableQueue()
        self.stopped = Event()
        self.count_processed = Value('i', 0)
        self.processes = [cls_worker(self, *args) for _ in range(count)]
        map(Process.start, self.processes)

    def is_active(self):
        return not self.stopped.is_set()

    def is_alive(self):
        alive = filter(bool, map(Process.is_alive, self.processes))
        print '---- %d child processes are still alive' % len(alive)
        return alive

    def stop(self):
        self.stopped.set()
        self.queue.close()
        print '-- waiting for processes to finish'
        map(Process.join, self.processes)
        self.queue.cancel_join_thread()

    def send_chunk(self, items):
        map(self.queue.put, items)
        print '--- waiting for queue to complete'
        while self.get_stats()[1] and self.is_alive():
            time.sleep(1)

    def get(self):
        while self.is_active():
            try:
                yield self.queue.get(timeout=1)
            except Queue.Empty:
                pass

    def tick(self):
        self.queue.task_done()
        self.count_processed.value += 1
        if not self.count_processed.value % 20:
            print '%d items processed' % self.count_processed.value
            time.sleep(0.5)

    def get_stats(self):
        stats = self.count_processed.value, self.queue.qsize()
        print '--- %d items processed, %d queued' % stats
        return stats
def pdf_workers(workers_count=2, debug=True, files=0):
    if db.is_closed():
        db.connect()

    additional = Job.select().where(Job.status == 1).count()
    Job.update(status=0).where(Job.status == 1).execute()
    progress_bar = ProcessBar(additional + files, debug=files != 0)

    queue = JoinableQueue()
    db_queue = Queue()
    workers = Workers(workers_count, queue, db_queue, files == 0)

    while True:
        if files != 0 and Job.select().where(Job.status != 2).count() == 0:
            break
        progress_bar.update((workers.cnt.value - progress_bar.value))
        if not db_queue.empty():
            jobs = []
            while not db_queue.empty():
                jobs.append(db_queue.get())
            with db.atomic():
                Job.bulk_update(jobs, fields=["status"], batch_size=100)
        elif workers_count - queue.qsize() > 0:
            additional_jobs = min(
                Job.select().where(Job.status == 0).count(),
                len(workers) - workers.pending.value,
            )
            if additional_jobs == 0:
                continue
            jobs = []
            for job in (
                Job.select()
                .where(Job.status == 0)
                .paginate(1, additional_jobs)
            ):
                job.status = 1
                jobs.append(job)
            with db.atomic():
                Job.bulk_update(jobs, fields=["status"], batch_size=100)
            for job in jobs:
                queue.put(job)

    queue.join()
    db.close()
def corpus_analyser(corpus_path, process_count):
    print('Scanning corpora...')
    file_queue = JoinableQueue()
    hearst_dict = Manager().dict()
    word_count = Value('i', 0)

    for root, subFolders, files in os.walk(corpus_path):
        for current_file in files:
            if current_file.endswith(".txt"):
                file_queue.put(os.path.join(root, current_file))

    file_count = file_queue.qsize()
    print "{0} files found.\n".format(file_count)
    sys.stdout.write("\r0.00%\tWord count: 0")

    def worker(process_id):
        while not file_queue.empty():
            current_path = file_queue.get()
            with open(current_path, 'r') as current_file:
                data = ' '.join(current_file.read().replace('\n', ' ').split())
                data_tokenized = word_tokenize(data)
                word_count.value += len(data_tokenized)
                data_tagged = pos_tag(data_tokenized)
                hearst_patterns(data_tagged, hearst_dict)
            percentage = 100.0 - ((float(file_queue.qsize()) / float(file_count)) * 100.0)
            sys.stdout.write("\r{0:.2f}%\tWord count: {1}".format(percentage, word_count.value))
            sys.stdout.flush()
            file_queue.task_done()

    for pid in range(process_count):
        process = Process(target=worker, args=(pid,))
        process.daemon = True
        process.start()

    file_queue.join()
    print "\n"
    return hearst_dict
def run(this):
    Factory.pg("configuration test...")
    this.test()  # check configuration
    source = this.source
    fout_name = this.fout_name
    func = this.func
    fparam = this.fparam
    num_workers = this.num_workers
    worker = this.worker
    progressor = this.progressor

    # queue settings
    Factory.pg("arranging source elements...")
    from multiprocessing import JoinableQueue, Process
    in_queue = JoinableQueue()
    for item in source:
        in_queue.put(item)

    # worker progressing
    progressor = Process(target=progressor, args=(in_queue, in_queue.qsize()))
    import time
    start_time = time.time()
    progressor.start()

    # worker settings
    fouts, workers = [], []
    for w_id in xrange(num_workers):
        fouts.append(open("%s_part%d" % (fout_name, w_id), "w"))
        workers.append(Process(target=worker, args=(w_id, in_queue, func, fparam, fouts[w_id])))
        workers[w_id].start()

    # post processing
    in_queue.join()
    for w_id in xrange(num_workers):
        workers[w_id].terminate()
    progressor.terminate()
    end_time = time.time()
    Factory.pg("working done (%.1fs lapsed)" % (end_time - start_time), br=True)

    import os
    os.system("cat %s_part* > %s" % (fout_name, fout_name))
    os.system("rm -f %s_part*" % (fout_name))
def worker(queue: JoinableQueue, securitiesDictionary: SecuritiesDict,
           orderStatistics: OrderStatisticsAggregator):
    """[The worker that will run in a thread]

    Args:
        queue (JoinableQueue): [The queue from which items will be taken]
        securitiesDictionary (SecuritiesDict): [Contains securities data keyed by securityId]
        orderStatistics (OrderStatisticsAggregator): [Contains aggregated order data]
    """
    counter: int = 0
    notEmpty: bool = True
    while notEmpty and queue.qsize():
        try:
            jsonStr = queue.get(block=True, timeout=5.0)
            jsonStr = jsonStr.decode('ASCII')
            innerProcessor(jsonStr, securitiesDictionary, orderStatistics)
            queue.task_done()
            counter += 1
        except Empty:
            notEmpty = False
class ProcessBlock(Process, ABC): """ The abstract class for a block/process in an execution pipeline """ # Arbitrary timeout for blocking queue operations _poll_interval = 1 def __init__(self, *args, parent=None, queue_size=0, **kwargs): super().__init__(*args, **kwargs) # Events (in the order they should be checked) self.events = OrderedDict([ ("cancel", Event()), ("requeue", Event()), ("stop", Event()), ]) # Corresponding event handlers self.event_handlers = { "cancel": self._cancel_handler, "requeue": self._requeue_handler, "stop": self._stop_handler, } # Master event, to be set after any other event self.event = Event() # The family of the processblock siblings = copy(parent.family.children) if parent is not None else [] self.family = BlockFamily(parent, siblings, []) # Link family with self self.family.link(self) # The object queue self.objs = JoinableQueue(queue_size) # List of objects that were canceled and need re-processing self._canceled_objs = deque() # Logging facility self.logger = getLogger(self.name) # Object currently processed self._obj = None def start(self): super().__init__(name=self.name) super().start() @abstractmethod def process_obj(self, obj): """ The actual work a block wants to perform on a object """ raise NotImplementedError() def _stop_handler(self): """ Send the "end object" (None) to every child """ self.logger.debug("sending the 'end object' to child processes...") for _ in self.family.alive_children(): self.objs.put(None) def cancel(self): """ Set the cancel event and the master event """ self.events["cancel"].set() self.event.set() def _cancel_handler(self): """ Cancel children's objects and re-queue them in self._canceled_objs """ self.logger.debug("ask children to requeue their objects") for child in self.family.alive_children(): child.events["requeue"].set() child.event.set() self.logger.debug("fetching canceled objects...") while (self.objs.qsize() != 0 or any(child.events["requeue"].is_set() for child in self.family.alive_children())): try: obj = self.objs.get_nowait() self.objs.task_done() except Empty: continue if obj is not None: self._canceled_objs.append(obj) # To be able to stop without the parent block sending an 'end object' if self.events["stop"].is_set(): self._canceled_objs.append(None) self.events["stop"].clear() # Clear the event self.events["cancel"].clear() def _requeue_handler(self): """ Requeue every object managed by the block or one of its children """ for child in self.family.alive_children(): child.events["requeue"].set() child.event.set() self.logger.debug("requeueing objects...") if self._obj is not None: self.family.parent.objs.put(self._obj) self._obj = None while (self.objs.qsize() != 0 or any(child.events["requeue"].is_set() for child in self.family.alive_children())): try: obj = self.objs.get_nowait() self.objs.task_done() except Empty: # Do not waste that time if self._canceled_objs: obj = self._canceled_objs.popleft() else: continue if obj is not None: self.family.parent.objs.put(obj) for obj in filter(lambda x: x is not None, self._canceled_objs): self.family.parent.objs.put(obj) self.logger.debug("wait for parent to fetch all the objects...") self.family.parent.objs.join() # Processblock was potentially stopped self.events["stop"].clear() # Clear the event self.events["requeue"].clear() def _process_events(self, ignore=()): """ Process events The order in which events are processed is important Returns: True --- if an Event was processed False --- otherwise """ self.logger.debug("process events...") if not 
self.event.is_set(): return False self.event.clear() event_processed = False for event_name in self.events: if event_name in ignore: continue if self.events[event_name].is_set(): self.logger.debug("processing '%s' event", event_name) self.event_handlers[event_name]() event_processed = True return event_processed def get_obj(self, timeout=None): """ Get an object from the parent block """ self.logger.debug("get an object to process...") try: return self._canceled_objs.popleft() except IndexError: obj = self.family.parent.objs.get(timeout=timeout) self.family.parent.objs.task_done() return obj def try_publish_obj(self, obj, poll_interval=None): """ Publish `obj` to child blocks (unless `obj` is None) Returns: True if `obj` was published False if an event occured before `obj` was published """ if obj is None: return True if not self.family.children: self.logger.debug("no one to pass '%s' onto", obj) return True self.logger.debug("publish '%s'", obj) while not self.event.is_set(): try: self.objs.put(obj, timeout=poll_interval) except Full: continue return True # An event occured self.logger.debug("publication was interrupted by an event") return False def _cleanup(self): """ Tell parent and siblings we stop and exit cleanly """ if self.family.parent is not None: self.family.parent.event.set() for sibling in self.family.siblings: sibling.event.set() self.logger.debug("waiting for child processes...") for child in self.family.children: child.join() def run(self): """ Launch child blocks and process objects """ # Launch child blocks # Children are started here in order to build a gracefull process tree self.logger.debug("start %d child(ren)", len(self.family.children)) for child in self.family.children: child.start() while not self.events["stop"].is_set(): # Processing loop while not self.events["stop"].is_set(): # Process exterior events if self._process_events(): continue # Find an object to process if self._obj is None: try: self._obj = self.get_obj(timeout=self._poll_interval) except Empty: continue if self._obj is None: self.logger.debug("received the 'end object'") self.events["stop"].set() self.event.set() continue obj = self._obj # Process the object self.logger.debug("process '%s'", obj) try: obj = self.process_obj(obj) except ProcessingError as exc: self.logger.warning(exc) continue except EventInterrupt: # An event ocrrured, process it continue # Publish the processed object, check for events periodically if self.try_publish_obj(obj, poll_interval=self._poll_interval): # Object was published, or did not need to be self._obj = None # Process the stop event (which is ignored in the loop underneath) self._process_events() # Wait for the entire family to stop, unless `stop` gets cleared while (self.events["stop"].is_set() and not self.family.is_stopped()): self.event.wait() self._process_events(ignore=("stop",)) # Process is exiting, there is no turning back # Every sibling/child process will shortly do so too (or already have) self._cleanup() self.logger.debug("terminating")
class MPController(object): """ Main MP object which maintains various queue and task objects and info. Launches generic client task processors as independant processes, populates the input queue and manages the writer process. """ def __init__(self, heartbeat=None, numProc=None, chunkSize=1): self.heartbeat = heartbeat # Heartbeat output manager self.tasks = JoinableQueue() # Tasks for processing to be added here self.results = Queue() # Processed results accumulate here self.writerConn = Pipe() # Direct pipe to writer manager self.writer = self.writerConn[0] self.resultDict = {} # Info and orderedStream objects if numProc is None: numProc = max(cpu_count() - 2, 1) self.nproc = numProc self.chunkSize = chunkSize if self.heartbeat: #heartbeat.Lock = Lock() heartbeat.message("Launching %d sub-processes" % self.nproc, True) heartbeat.message( "Tasks will process pairs in chunks of %d" % self.chunkSize, True) else: print "CONTROLLER WILL LAUNCH %d sub-processes" % self.nproc print "Tasks will process pairs in chunks of %d" % self.chunkSize sys.stdout.flush() #self.configureQueueLimits() self.tasksRunning = 0 self.Counter = Value('L', 0) # Shared value: counts processed pairs self.recordsProcessed = 0 tLock = Lock() tValue = Value('L', 0) self.workers = [ TaskProcessor(self.tasks, self.results, tValue, tLock) for i in xrange(self.nproc) ] def configureQueueLimits(self): availGB = AvailableRAM() self.HIGH_WATERMARK = int(WATERMARK_BASE * availGB) self.LOW_WATERMARK = int(round(self.HIGH_WATERMARK * 0.50)) def add(self, task, qhigh, qlow): """Block if Qsize above a certain high watermark in item size, and don't release until it has fallen below the low watermark""" try: qlen = self.tasks.qsize() if qlen > qhigh: print "Throttling input, reached HWM:", qhigh while qlen > qlow: delay = random.randint(1, 10) time.sleep(delay) qlen = self.tasks.qsize() print "Throttling released, down to LWM:", qlow except NotImplementedError: # Skip on Mac OS X (WARNING - use on OS X in testing only, queue # size will max out at a paltry 32768 items) pass try: self.tasks.put(task) self.recordsProcessed += task.datalen except qFull: # While testing: we shouldn't hopefully end up here... print "ERR: queue full" sys.exit(-1) def finishQueue(self): for i in xrange(self.nproc): self.tasks.put(None) if self.heartbeat is not None: self.heartbeat.total = self.recordsProcessed newCount = self.recordsProcessed self.heartbeat.message("Definitive pair count: %d" % newCount, True) def start(self): i = 0 for worker in self.workers: worker.start() i += 1 self.tasksRunning = self.nproc def OneTaskDone(self): self.tasksRunning -= 1 def wait(self): self.tasks.join() if self.heartbeat: self.heartbeat.count = self.recordsProcessed def finishProcesses(self): self.writer.close() def getpids(self): pids = [] for w in self.workers: pids.append(w.pid) return pids def Send(self, obj): self.writer.send(obj) def Recv(self): return self.writer.recv()
class WebDav: NUM_WORKING_PROCESSES = 5 def __init__(self, host, user, passwd, timeout=-999, logger=None): self.fp = dict() webdav_host = host self.webdav_host = webdav_host self.host = host self.user = user self.passwd = passwd self.processes = [] self.file_queue = JoinableQueue(maxsize=0) self.result_queue = Queue(maxsize=0) self.is_alive = { "status": True } options = { 'webdav_hostname': self.webdav_host, 'webdav_login': self.user, 'webdav_password': self.passwd } self.webdavClient = wc.Client(options) self.logger = logger self._tzinfo = TimeZoneMSK() def parent(self, path): return urn.Urn(path).parent() def path(self, path): return urn.Urn(path).path() def generate_file_info(self, file_path): info = self.webdavClient.info(file_path) is_dir = False is_link = False if self.webdavClient.is_dir(file_path): is_dir = True else: pass file_name = urn.Urn(file_path).filename().replace("/", "") file_dir = urn.Urn(file_path).parent() ext = '' divide = file_name.split('.') if len(divide) > 1: ext = file_name.split('.')[-1].lower() mtime = info['modified'] file_info = { "is_dir": is_dir, "is_link": is_link, "name": file_name, "ext": ext, "path": file_dir, "owner": self.user, "mode": "600", "size": info['size'] if not is_dir else 0, "mtime": mtime, 'mtime_str': str(mtime), } return file_info def _make_file_info(self, file_queue, result_queue, logger, timeout): while int(time.time()) < timeout: if file_queue.empty() is not True: file_path = file_queue.get() try: file_info = self.generate_file_info(file_path) result_queue.put(file_info) except UnicodeDecodeError as unicode_e: logger.error( "UnicodeDecodeError %s, %s" % (str(unicode_e), traceback.format_exc())) except IOError as io_e: logger.error("IOError %s, %s" % (str(io_e), traceback.format_exc())) except Exception as other_e: logger.error("Exception %s, %s" % (str(other_e), traceback.format_exc())) finally: file_queue.task_done() else: time.sleep(REQUEST_DELAY) @staticmethod def to_byte(value): if isinstance(value, str): try: value = value.encode("utf-8") except UnicodeDecodeError: value = value.encode("ISO-8859-1") return value def size(self, path): try: return self.webdavClient.info(path)['size'] except Exception as e: self.logger.error("Error in WebDav size(): %s, traceback = %s" % (str(e), traceback.format_exc())) return 0 def info(self, path): return self.webdavClient.info(self.to_byte(path)) def exists(self, path): return self.webdavClient.check(path) def isdir(self, path): return self.webdavClient.is_dir(path) def isfile(self, path): return not self.webdavClient.is_dir(self.to_byte(path)) def list(self, path): flist = { "path": path, "items": [] } try: self.webdavClient.check('/') except Exception: raise Exception("Error during establishing webdav connection") listdir = self.webdavClient.list(self.to_byte(path)) self.logger.info("listdir=%s", listdir) time_limit = int(time.time()) + TIMEOUT_LIMIT self.file_queue = JoinableQueue(maxsize=0) self.result_queue = Queue(maxsize=0) for i in range(self.NUM_WORKING_PROCESSES): p = Process(target=self._make_file_info, args=(self.file_queue, self.result_queue, self.logger, time_limit)) p.start() proc = psutil.Process(p.pid) proc.ionice(psutil.IOPRIO_CLASS_IDLE) proc.nice(20) self.logger.debug( "ListDir worker #%s, set ionice = idle and nice = 20 for pid %s" % ( str(i), str(p.pid))) self.processes.append(p) for name in listdir: try: item_path = '{0}/{1}'.format(path, name) self.file_queue.put(item_path) except UnicodeDecodeError as e: self.logger.error( "UnicodeDecodeError %s, %s" % (str(e), 
traceback.format_exc())) except IOError as e: self.logger.error("IOError %s, %s" % (str(e), traceback.format_exc())) except Exception as e: self.logger.error( "Exception %s, %s" % (str(e), traceback.format_exc())) while not self.file_queue.empty(): self.logger.debug("file_queue size = %s , empty = %s (timeout: %s/%s)" % ( self.file_queue.qsize(), self.file_queue.empty(), str(int(time.time())), time_limit)) time.sleep(REQUEST_DELAY) if self.file_queue.empty(): self.logger.debug("join() file_queue until workers done jobs") self.file_queue.join() for p in self.processes: try: self.logger.debug("WebDav ListDir terminate worker process, pid = %s" % p.pid) kill(p.pid, signal.SIGKILL, self.logger) except OSError: self.logger.error( "ListDir unable to terminate worker process, pid = %s" % p.pid) if self.is_alive['status'] is True: while not self.result_queue.empty(): file_info = self.result_queue.get() flist["items"].append(file_info) return flist def listdir(self, path): listdir = self.webdavClient.list(path) listing = [] for name in listdir: item_path = '{0}/{1}'.format(path, name) listing.append(item_path) return listing def remove(self, target): try: self.logger.debug("Removing target=%s" % target) if self.isdir(target): target += '/' self.webdavClient.unpublish(target) self.webdavClient.clean(target) except Exception as e: self.logger.error("Error in WebDav dir remove(): %s, traceback = %s" % (str(e), traceback.format_exc())) raise Exception def mkdir(self, path): self.logger.debug("Creating directory=%s" % path) return self.webdavClient.mkdir(path) def upload(self, source, target, overwrite=False, rename=None, operation_progress=None): result = {} file_list = {} succeed = [] failed = [] try: if rename is not None: target_path = os.path.join(target, rename) else: target_path = os.path.join(target, source) if not overwrite and self.exists(target_path): failed.append(source) raise Exception("File '%s' already exists and overwrite not permitted" % target_path) try: self.logger.debug("Uploading target_path=%s, source=%s" % (target_path, source)) self.webdavClient.upload(target_path, source, operation_progress) except Exception as e: failed.append(source) self.logger.error("Error in WebDav upload(): %s, traceback = %s" % (str(e), traceback.format_exc())) raise Exception("Error during file uploading %s" % traceback.format_exc()) succeed.append(source) file_list['succeed'] = succeed file_list['failed'] = failed result['success'] = True result['error'] = None result['file_list'] = file_list return result except Exception as e: self.logger.error("Error in WebDav upload(): %s, traceback = %s" % (str(e), traceback.format_exc())) file_list['succeed'] = succeed file_list['failed'] = failed result['success'] = False result['error'] = e result['file_list'] = file_list return result def download(self, source, target, operation_progress=None): result = {} file_list = {} succeed = [] failed = [] try: target_path = os.path.join(target, os.path.basename(source)) try: self.logger.debug("Downloading source=%s, target_path=%s" % (source, target_path)) self.webdavClient.download(source, target_path, operation_progress) except Exception as e: failed.append(source) self.logger.error("Error in WebDav download(): %s, traceback = %s" % (str(e), traceback.format_exc())) raise Exception("Error during file download") succeed.append(source) file_list['succeed'] = succeed file_list['failed'] = failed result['success'] = True result['error'] = None result['file_list'] = file_list return result except Exception as e: 
self.logger.error("Error in WebDav download(): %s, traceback = %s" % (str(e), traceback.format_exc())) file_list['succeed'] = succeed file_list['failed'] = failed result['success'] = False result['error'] = e result['file_list'] = file_list return result def copy_file(self, source, target, overwrite=False): result = {} file_list = {} succeed = [] failed = [] try: if not overwrite and self.exists(target): failed.append(source) raise Exception('file exist and cannot be overwritten') try: self.logger.debug("Copying file source=%s, target=%s" % (source, target)) self.webdavClient.copy(source, target) except Exception as e: failed.append(source) raise Exception('Cannot copy file %s' % (e,)) succeed.append(source) file_list['succeed'] = succeed file_list['failed'] = failed result['success'] = True result['error'] = None result['file_list'] = file_list return result except Exception as e: file_list['succeed'] = succeed file_list['failed'] = failed result['success'] = False result['error'] = e result['file_list'] = file_list return result def move_file(self, source, target, overwrite=False): result = {} file_list = {} succeed = [] failed = [] try: if not overwrite and self.exists(target): failed.append(source) raise Exception('file exist and cannot be overwritten') try: self.logger.debug("Moving file source=%s, target=%s" % (source, target)) self.webdavClient.move(source, target) except Exception as e: failed.append(source) raise Exception('Cannot move file %s' % (e,)) succeed.append(source) file_list['succeed'] = succeed file_list['failed'] = failed result['success'] = True result['error'] = None result['file_list'] = file_list return result except Exception as e: file_list['succeed'] = succeed file_list['failed'] = failed result['success'] = False result['error'] = e result['file_list'] = file_list return result def make_destination_dir(self, destination, overwrite): self.logger.info("making destination %s" % destination) if not self.exists(destination): self.mkdir(destination) elif overwrite and self.exists(destination) and not self.isdir(destination): self.remove(destination) self.mkdir(destination) elif not overwrite and self.exists(destination) and not self.isdir(destination): raise Exception("destination is not a dir") else: pass
db = psycopg2.connect("dbname='blockchain_ethereum4' user='******' host='localhost' password=''",
                      cursor_factory=psycopg2.extras.DictCursor)
cursor = db.cursor()
cursor.execute("SELECT DISTINCT(address) FROM ethereum.Addresses")

work_queue = JoinableQueue()
addresses_count = 0

for i in range(NUM_WORKERS):
    CrawlerWorkerProcess(work_queue).start()

for row in cursor.fetchall():
    work_queue.put(row.get("address"))
    addresses_count += 1

print("Looking up %d addresses on Etherscan.io using %d workers." % (addresses_count, NUM_WORKERS))

last_time = time.time()
previous_addresses_left = work_queue.qsize()

while work_queue.qsize() > 0:
    time_now = time.time()
    time_since = (time_now - last_time)
    addresses_left = work_queue.qsize()
    addresses_done_since_last = previous_addresses_left - addresses_left
    try:
        time_left = int(addresses_left * (time_since / addresses_done_since_last))
    except ZeroDivisionError:
        time_left = 9999999999999
    m, s = divmod(time_left, 60)
    h, m = divmod(m, 60)
    time_remaining_string = "%dh:%02dm:%02ds" % (h, m, s)
    print("#" * 50)
    print("###### Last %d addresses took: % 4.2fs - %d addresses left (%s) ######"
          % (addresses_done_since_last, time_since, addresses_left, time_remaining_string))
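# CrawlerWorkerProcess is referenced but not defined in this excerpt; a minimal hypothetical
# stand-in is sketched below purely to show the shape the script expects: a Process whose run()
# keeps pulling addresses off the shared JoinableQueue and calls task_done() for each one.
# The lookup itself (the Etherscan.io API call) is elided.
from multiprocessing import JoinableQueue, Process
from queue import Empty


class CrawlerWorkerProcess(Process):
    def __init__(self, work_queue: JoinableQueue):
        super().__init__(daemon=True)
        self.work_queue = work_queue

    def run(self):
        while True:
            try:
                address = self.work_queue.get(timeout=5)
            except Empty:
                break
            # ... look the address up and store the result (omitted) ...
            self.work_queue.task_done()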
def main(factor=2):
    # e.g. if the total number of cores is 2, the number of processes to be spawned is 2 * factor
    files_to_download = JoinableQueue()
    result_queue = JoinableQueue()
    time_taken = JoinableQueue()
    time_taken_to_read_from_queue = JoinableQueue()

    with open('downloads.txt', 'r') as f:
        for to_download in f:
            files_to_download.put_nowait(to_download.split('\n')[0])
    files_to_download_size = files_to_download.qsize()

    cores = cpu_count()
    no_of_processes = cores * factor
    for i in xrange(no_of_processes):
        files_to_download.put_nowait(None)

    jobs = []
    start = datetime.datetime.now()
    for name in xrange(no_of_processes):
        p = Process(target=download, args=(files_to_download, result_queue,
                                           time_taken, time_taken_to_read_from_queue, name))
        p.start()
        jobs.append(p)
    for job in jobs:
        job.join()

    print result_queue.qsize()
    total_downloaded_urls = 0
    try:
        while 1:
            r = result_queue.get_nowait()
            total_downloaded_urls += r
    except Empty:
        pass

    try:
        while 1:
            """
            locals() keeps track of all variables, functions, classes etc.
            A datetime object is different from an int: one cannot perform 0 + datetime.datetime.now(),
            so the first time we read from the queue of time objects, total_time is set to that value.
            """
            if 'total_time' in locals():
                total_time += time_taken.get_nowait()
            else:
                total_time = time_taken.get_nowait()
    except Empty:
        print("{0} processes on {1} core machine took {2} time to download {3} urls".format(
            no_of_processes, cores, total_time, total_downloaded_urls))

    try:
        while 1:
            if 'queue_reading_time' in locals():
                queue_reading_time += time_taken_to_read_from_queue.get_nowait()
            else:
                queue_reading_time = time_taken_to_read_from_queue.get_nowait()
    except Empty:
        print("{0} processes on {1} core machine took {2} time to read {3} urls from queue".format(
            no_of_processes, cores, queue_reading_time, files_to_download_size))
class TagCounter:
    def __init__(self, domain, input_col):
        self.domain = domain
        self.input_col = input_col
        self.htmls_queue = JoinableQueue()
        self.tags_queue = Queue()
        self.tag_count = {}

    def count_tags(self):
        tag_count_start_time = datetime.now()
        self.fill_htmls_queue()
        print("Input htmls_queue size: ", self.htmls_queue.qsize())

        pool = Pool(settings.NUM_PROCESSORS, self.processor)
        self.htmls_queue.join()
        print("Output tags_queue size: ", self.tags_queue.qsize())

        tags_merge_start_time = datetime.now()
        while True:  # TODO: move tags merging to a separate process (>20% processing time)
            try:
                tags_batch = self.tags_queue.get(True, 2)
            except Empty:
                print("No batches in tags_queue. tags_queue size: %d htmls_queue size: %d"
                      % (self.tags_queue.qsize(), self.htmls_queue.qsize()))
                break
            for tag, count in tags_batch.items():
                if self.tag_count.get(tag):
                    self.tag_count[tag] += count
                else:
                    self.tag_count[tag] = count

        finish_time = datetime.now()
        print("Tags merging time: %s" % (finish_time - tags_merge_start_time))
        print("Tags counting time: %s" % (finish_time - tag_count_start_time))
        pool.close()  # TODO: why does tags_queue.get get stuck if we close the pool before tags are merged?

    # def persist_tags(): saves to pickle file or Mongo

    def fill_htmls_queue(self):
        domain_htmls = list(self.input_col.find({'domain': self.domain}))  # TODO: get slices from mongo
        # TODO: check amount of tags in slowly processed HTMLs
        batch_size = settings.BATCH_SIZE
        html_batch_start_time = datetime.now()
        while domain_htmls:
            batch = domain_htmls[:batch_size]
            self.htmls_queue.put(batch)  # TODO: move html batching to a separate process
            domain_htmls = domain_htmls[batch_size:]
        print("HTMLs batching time: %s" % (datetime.now() - html_batch_start_time))

    def processor(self):
        pages_processed = 0
        pid = os.getpid()
        while True:
            tag_count = {}
            try:
                htmls_batch = self.htmls_queue.get(True, 2)  # TODO: test changing timeout
            except Empty:
                print("No batches in htmls_queue, pid: %d tags_queue size: %d htmls_queue size: %d"
                      % (pid, self.tags_queue.qsize(), self.htmls_queue.qsize()))
                # TODO: check if htmls_queue still exists at that moment
                break
            print("Process %d created, batch len: %d" % (pid, len(htmls_batch)))
            for page in htmls_batch:
                if page.get('no_repeat_html'):
                    soup = BeautifulSoup(page['no_repeat_html'], 'html.parser')
                else:
                    soup = BeautifulSoup(page['full_page_html'], 'html.parser')
                for tag in soup.find_all():
                    # TODO: add tag processing here as a separate function
                    stag = str(tag).strip().replace(" ", "").lower()
                    if stag in tag_count:
                        tag_count[stag] += 1
                    else:
                        tag_count[stag] = 1
                pages_processed += 1
                if pages_processed % 100 == 0:
                    print('%s, %s, pages_processed = %d, len(tag_count) = %d'
                          % (pid, str(datetime.now()), pages_processed, len(tag_count)))
            self.tags_queue.put(tag_count)
            self.htmls_queue.task_done()

    def sort_tags(self):
        self.tag_count = sorted(self.tag_count.items(), key=operator.itemgetter(1), reverse=True)
class Engine(Process): """ Based off `threading.Thread` Instantiated in server Contains buffer, queue for processors, processors, ConsumerThread. """ def __init__(self, engine_conn, processors=4, buffer_roll=0, buffer_max_batch=50, buffer_max_seconds=1, test_mode=False, test_outfile='engine_test_output/engine_test_output'): """ Initializes with empty buffer & queue, set # of processors... :param processors: number of processors to start :type processors: int """ logger.info("Initializing EngineThread") super().__init__() self.test_run = test_mode self.test_outfile = test_outfile self.test_batches = {} self.pipe_conn = engine_conn self.buffers_out_q = JoinableQueue() self.number_of_processors = processors self.processors = [] self.run_engine = False self.buffer_record_limit = int(buffer_max_batch) self.buffer_time_limit_s = float(buffer_max_seconds) self.buffers = {} self.buffer_in_qs = {} self.buffer_workers = {} self.data_pullers = {} self.buffer_roll = -buffer_roll if buffer_roll > 0: self.buffer_roll_index = -buffer_roll else: self.buffer_roll_index = None def _init_processors(self): """Initializes + starts set number of processors""" for n in range(self.number_of_processors): processor = Processor(self.buffers_out_q, self.test_run) processor.start() self.processors.append(processor) def _new_buffer(self, partition_key): if partition_key not in self.buffers: self.buffers[partition_key] = np.array([{ 0: 0 }] * (self.buffer_record_limit * self.number_of_processors)).reshape( self.number_of_processors, self.buffer_record_limit) if self.test_run: self.test_batches[partition_key] = 1 if partition_key not in self.buffer_in_qs: self.buffer_in_qs[partition_key] = Queue() else: logger.warn( f"New buffer, existing buffer_in_q for stream {partition_key}" ) if partition_key not in self.buffer_workers: self.buffer_workers[partition_key] = Thread( target=self.run_buffer, args=[partition_key]) self.buffer_workers[partition_key].start() else: logger.warn( f"New buffer, existing buffer_worker for stream {partition_key}" ) return True else: return False def _new_data_puller(self, partition_key, template): if partition_key not in self.data_pullers: if partition_key in self.buffer_in_qs: self.data_pullers[partition_key] = DataPuller( template, self.buffer_in_qs[partition_key]) self.data_pullers[partition_key].start() else: raise ValueError( f"Attempting to init data_puller for stream {partition_key}, buffer input queue for stream does not exist" ) return True else: return False def run(self): """ Sets up numpy array buffer and puts stuff in and gets stuff out """ self._init_processors() self.run_engine = True while self.run_engine: if self.pipe_conn.poll(): item = self.pipe_conn.recv() # branch 2 - stop engine if item == "stop_poison_pill": for q in self.buffer_in_qs.keys(): self.buffer_in_qs[q].put("stop_buffer_worker") self.run_engine = False break # branch 1 - engine running, good data elif type(item) is tuple: partition_key = item[0]['stream_token'] new_buffer = self._new_buffer(partition_key) if new_buffer: logger.info( f"Initialized buffer for stream {partition_key}") if item[1] == "new": if item[0]["data_rules"]["pull"] is True: new_puller = self._new_data_puller( partition_key, item[0]) if new_puller: print( f"Initialized data puller for stream {partition_key}" ) else: logger.warn( f"Attempting to initialize data puller for stream {partition_key} - puller already exists" ) elif item[1] == "load": self.buffer_in_qs[partition_key].put(item[0]) else: raise TypeError( "Invalid tuple in pipe - index 1 
must be str 'load' or str 'new'" ) else: raise TypeError("Invalid item in pipe") logger.info("Terminating Engine Thread") self.stop_engine() def run_buffer(self, partition_key): last_col = self.buffer_record_limit - 1 last_row = self.number_of_processors - 1 cur_row = 0 cur_col = 0 batch_tracker = {'start_time': time(), 'leftos_collected': False} while self.run_engine: try: item = self.buffer_in_qs[partition_key].get( timeout=self.buffer_time_limit_s) # branch 2 - stop engine if item == "stop_buffer_worker": break # branch 1 - engine running, good data elif isinstance( item, DStream) or (type(item) is dict and "stream_token" in item.keys()): if "data_rules" in item.keys( ): # some unit test data doesnt have this field if "date_format" in item["data_rules"].keys(): if item["data_rules"]["date_format"] is not None: item["timestamp"] = datetime.strptime( item["timestamp"], item["data_rules"] ["date_format"]).timestamp() # branch 1.1 - not last row if cur_row < last_row: # branch 1.1a - not last column, continue row if cur_col < last_col: logger.info("Buffering- row {}".format(cur_row)) self.buffers[partition_key][cur_row, cur_col] = item cur_col += 1 # branch 1.1b - last column, start new row else: self.buffers[partition_key][cur_row, cur_col] = item if self.test_run: self.buffers_out_q.put(( self.buffers[partition_key] [cur_row].copy(), f"{self.test_outfile}_{partition_key}_{self.test_batches[partition_key]}.txt" )) self.test_batches[partition_key] += 1 else: self.buffers_out_q.put( self.buffers[partition_key] [cur_row].copy()) logger.info("New batch queued") roll_window = self.buffers[partition_key][ cur_row, self.buffer_roll_index:] cur_row += 1 for n in roll_window: for i in range(abs(self.buffer_roll)): self.buffers[partition_key][cur_row, i] = n cur_col -= cur_col + self.buffer_roll # REMOVE batch_tracker['start_time'] = time() # branch 1.2 - last row else: # branch 1.2a - not last column, continue row if cur_col < last_col: self.buffers[partition_key][cur_row, cur_col] = item cur_col += 1 # branch 1.2b - last column, start return to first row in new cycle else: self.buffers[partition_key][cur_row, cur_col] = item if self.test_run: self.buffers_out_q.put(( self.buffers[partition_key] [cur_row].copy(), f"{self.test_outfile}_{partition_key}_{self.test_batches[partition_key]}.txt" )) self.test_batches[partition_key] += 1 else: self.buffers_out_q.put( self.buffers[partition_key] [cur_row].copy()) roll_window = self.buffers[partition_key][ cur_row, self.buffer_roll_index:] cur_row -= cur_row for n in roll_window: for i in range(abs(self.buffer_roll)): self.buffers[partition_key][cur_row, i] = n cur_col -= cur_col + self.buffer_roll batch_tracker['start_time'] = time() batch_tracker['leftos_collected'] = False # branch 3 bad data else: raise TypeError("Queued item is not valid dictionary.") except: # buffer time max reached, engine still running logger.info("Buffer batch timeout exceeded") if self.run_engine is True: # engine running, batch timeout with new buffer data (partial row) if cur_col > abs( self.buffer_roll ) and batch_tracker['leftos_collected'] is False: logger.info( "Collecting leftovers- pushing partial batch to queue after batch timeout" ) if self.test_run: self.buffers_out_q.put(( self.buffers[partition_key][ cur_row, :cur_col].copy(), f"{self.test_outfile}_{partition_key}_{self.test_batches[partition_key]}.txt" )) self.test_batches[partition_key] += 1 else: self.buffers_out_q.put(self.buffers[partition_key][ cur_row, :cur_col].copy()) if cur_row < last_row: cur_row += 1 else: 
cur_row -= cur_row cur_col -= cur_col batch_tracker['start_time'] = time() batch_tracker['leftos_collected'] = True # leftovers already collected else: logger.info("No new data- resetting batch timer") batch_tracker['start_time'] = time() def stop_engine(self): self.pipe_conn.close() if self.run_engine is True: self.run_engine = False for p in self.data_pullers.keys(): self.data_pullers[p].pulling = False logger.info(self.buffers_out_q.qsize()) self.buffers_out_q.join() logger.info("Queue joined") for p in self.processors: logger.info("Putting poison pills in Q") self.buffers_out_q.put("666_kIlL_thE_pROCess_666") logger.info("Poison pills done") for p in self.processors: p.join() logger.info("Engine shutdown- processor joined") print("done")
paths = findFilesInDir('/mnt/ephemeral0/xml/')
unsearched = Queue()
for path in paths:
    unsearched.put(path)
print("Number of files", len(paths))

if DEBUG:
    # Run one process
    print("DEBUG: Running single threaded.")
    parallel_worker()
else:
    print("Starting pool")
    NUM_WORKERS = 7
    pool = Pool(NUM_WORKERS)
    results = [pool.apply_async(parallel_worker) for i in range(NUM_WORKERS)]
    print("Running progress capture.")
    while (True):
        remaining = unsearched.qsize()
        print "Waiting for", remaining, "tasks to complete..."
        time.sleep(0.5)
    # print [result.get() for result in results]

unsearched.join()
print 'Done'
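# parallel_worker and findFilesInDir are not shown in the snippet above. A hypothetical sketch of
# the worker follows, inferred from how it is used: it reads paths from the global `unsearched`
# queue until the queue is drained and calls task_done() for each, so unsearched.join() can
# return. The actual per-file search logic is elided; Python 2 is assumed, matching the snippet's
# print statements.
def parallel_worker():
    from Queue import Empty  # Python 2 module name
    while True:
        try:
            path = unsearched.get_nowait()
        except Empty:
            break
        # ... search the XML file at `path` here (omitted) ...
        unsearched.task_done()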
    logging.debug("I should be exiting.")


if __name__ == "__main__":
    start_time = datetime.now()
    q = JoinableQueue(MAX_QUEUE_SIZE)
    out_queue = JoinableQueue()
    # c = Counter()

    qf = QueueFiller(q, 1, MAX_PRIME_NUMBER)
    qf.start()

    tasks = [Task(q, out_queue) for i in range(NUM_WORKERS)]
    for w in tasks:
        w.start()

    logging.info("Items left in queue: {0}".format(q.qsize()))

    logging.debug("Joining q")
    # q.join()
    # qf.join()

    if False:
        processes_active = True
        while processes_active:
            for w in tasks:
                processes_active = False or w.is_alive()
                logging.debug(w.is_alive())
            sleep(0.2)

    for y in tasks:
        y.join()
class ProcessPool:
    """ Class which enables multiprocess calls to custom functions """

    class Shared:
        """ Object shared between processes. Sync'd by the BaseManager """

        def __init__(self):
            self.clear()

        def get(self):
            return self.data

        def add(self, val):
            self.data.append(val)

        def clear(self):
            self.data = []

    def __init__(self, processes_count, *args, **kwargs):
        self.sleep_length = 2
        self.processes_count = processes_count
        self.queue_jobs = JoinableQueue()
        self.processes = []

        BaseManager.register('Shared', self.Shared)
        self.manager = BaseManager()
        self.manager.start()
        self.shared = self.manager.Shared()

        for i in range(self.processes_count):
            p = Process(target=self.make_pool_call)
            p.id = i
            p.start()
            self.processes.append(p)

    def make_pool_call(self):
        while True:
            item_pickled = self.queue_jobs.get()

            if item_pickled is None:
                self.queue_jobs.task_done()
                break

            item = dill.loads(item_pickled)
            call = item.get('call')
            args = item.get('args')
            kwargs = item.get('kwargs')

            try:
                result = call(*args, **kwargs)
                self.shared.add(result)
            except Exception as e:
                import traceback
                traceback.print_exc()
                os.kill(os.getpid(), signal.SIGUSR1)

            self.queue_jobs.task_done()

    def add_job(self, job):
        """
        :param: job: has to be a dilled dict:
            {
                'call': function_to_be_called_by_process,
                'args': [],
                'kwargs': {},
            }
        """
        self.queue_jobs.put(job)

    def finish_pool_queue(self):
        while self.queue_jobs.qsize() > 0:
            sleep(self.sleep_length)
        for i in range(self.processes_count):
            self.queue_jobs.put(None)
        self.queue_jobs.join()
        self.queue_jobs.close()
        for p in self.processes:
            p.join()
        del self.processes[:]

    def get_pool_results(self):
        return self.shared.get()

    def clear_pool_results(self):
        self.shared.clear()
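# Hypothetical usage sketch of ProcessPool above, based on the add_job docstring: each job is a
# dict serialized with dill containing 'call', 'args' and 'kwargs'. The target function used here
# (operator.mul) is only an example and assumes dill is importable in the calling module.
import dill
import operator

pool = ProcessPool(processes_count=4)
for n in range(10):
    pool.add_job(dill.dumps({'call': operator.mul, 'args': [n, n], 'kwargs': {}}))
pool.finish_pool_queue()        # waits for the queue to drain, then shuts the workers down
print(pool.get_pool_results())  # unordered list of results collected via the BaseManager proxy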
class MultiprocessingGlycoproteinSiteModelBuildingWorkflow( GlycoproteinSiteModelBuildingWorkflowBase): def __init__(self, analyses, glycopeptide_database, glycan_database, unobserved_penalty_scale=None, lambda_limit=0.2, require_multiple_observations=True, observation_aggregator=None, output_path=None, n_threads=4, q_value_threshold=0.05): super(MultiprocessingGlycoproteinSiteModelBuildingWorkflow, self).__init__(analyses, glycopeptide_database, glycan_database, unobserved_penalty_scale, lambda_limit, require_multiple_observations, observation_aggregator, output_path, q_value_threshold=q_value_threshold) self.builder = None self.input_queue = JoinableQueue(1000) self.output_queue = JoinableQueue(1000) self.input_done_event = Event() self.n_threads = 1 self.n_workers = n_threads self.workers = [] self._has_remote_error = False self.ipc_manager = self.ipc_logger() def prepare_glycoprotein_for_dispatch(self, glycoprotein, builder): prepared = builder.prepare_glycoprotein(glycoprotein) return prepared def feed_queue(self, glycoproteins, builder): n = len(glycoproteins) n_sites = self.count_glycosites(glycoproteins) self.log("Analyzing %d glycoproteins with %d occupied N-glycosites" % (n, n_sites)) i_site = 0 for glycoprotein in glycoproteins: prepared = self.prepare_glycoprotein_for_dispatch( glycoprotein, builder) for work_item in prepared: i_site += 1 self.input_queue.put(work_item) if i_site % 50 == 0 and i_site != 0: self.input_queue.join() self.input_done_event.set() def _handle_local(self, glycoproteins, builder, seen): for glycoprotein in glycoproteins: prepared = self.prepare_glycoprotein_for_dispatch( glycoprotein, builder) for records, site, protein_stub in prepared: key = (protein_stub.name, site) if key in seen: continue else: seen[key] = -1 model = builder.fit_site_model(records, site, protein_stub) if model is not None: self.builder.site_models.append(model) def make_workers(self): for _i in range(self.n_workers): worker = GlycositeModelBuildingProcess( self.builder, self.input_queue, self.output_queue, producer_done_event=self.input_done_event, output_done_event=Event(), log_handler=self.ipc_manager.sender()) self.workers.append(worker) worker.start() def clear_pool(self): for _i, worker in enumerate(self.workers): exitcode = worker.exitcode if exitcode != 0 and exitcode is not None: self.log("... Worker Process %r had exitcode %r" % (worker, exitcode)) try: worker.join(1) except AttributeError: pass if worker.is_alive(): self.debug( "... Worker Process %r is still alive and incomplete" % (worker, )) worker.terminate() def all_workers_finished(self): """Check if all worker processes have finished. """ worker_still_busy = False assert self.workers for worker in self.workers: try: is_done = worker.all_work_done() if not is_done: worker_still_busy = True break except (RemoteError, KeyError) as err: worker_still_busy = True self._has_remote_error = True break return not worker_still_busy def _fit_glycoprotein_site_models(self, glycoproteins, builder): self.builder = builder feeder_thread = Thread(target=self.feed_queue, args=(glycoproteins, builder)) feeder_thread.daemon = True feeder_thread.start() self.make_workers() n_sites = self.count_glycosites(glycoproteins) seen = dict() strikes = 0 start_time = time.time() i = 0 has_work = True while has_work: try: site_model = self.output_queue.get(True, 3) self.output_queue.task_done() key = (site_model.protein_name, site_model.position) seen[(key)] = i if key in seen: self.debug( "...... Duplicate Results For %s. 
First seen at %r, now again at %r" % ( key, seen[key], i, )) else: seen[key] = i i += 1 strikes = 0 if i % 1 == 0: self.log("...... Processed %d sites (%0.2f%%)" % (i, i * 100. / n_sites)) if not isinstance(site_model, EmptySite): self.builder.site_models.append(site_model) except QueueEmptyException: if len(seen) == n_sites: has_work = False # do worker life cycle management here elif self.all_workers_finished(): if len(seen) == n_sites: has_work = False else: strikes += 1 if strikes % 25 == 0: self.log( "...... %d cycles without output (%d/%d, %0.2f%% Done)" % (strikes, len(seen), n_sites, len(seen) * 100. / n_sites)) self.debug("...... Processes") for worker in self.workers: self.debug("......... %r" % (worker, )) self.debug("...... IPC Manager: %r" % (self.ipc_manager, )) if strikes > 1000: self.log( "Too much time has elapsed waiting for final results, finishing locally." ) self._handle_local(glycoproteins, builder, seen) else: strikes += 1 if strikes % 50 == 0: self.log( "...... %d cycles without output (%d/%d, %0.2f%% Done, %d children still alive)" % (strikes, len(seen), n_sites, len(seen) * 100. / n_sites, len(multiprocessing.active_children()) - 1)) try: input_queue_size = self.input_queue.qsize() except Exception: input_queue_size = -1 is_feeder_done = self.input_done_event.is_set() self.log( "...... Input Queue Status: %r. Is Feeder Done? %r" % (input_queue_size, is_feeder_done)) if strikes > 1000: self.log( "Too much time has elapsed waiting for workers, finishing locally." ) self._handle_local(glycoproteins, builder, seen) continue self.clear_pool() self.ipc_manager.stop() feeder_thread.join() dispatcher_end = time.time() self.log("... Dispatcher Finished (%0.3g sec.)" % (dispatcher_end - start_time))
class HadronicRunner2: def __init__(self, m2, q2, Delta, nlf, pdfs, pdfMem, mu02, aS, fs, fp, nProcesses = cpu_count()): # parameters self.m2 = m2 self.q2 = q2 self.Delta = Delta self.nlf = nlf self.pdfs = pdfs self.pdfMem = pdfMem self.mu02 = mu02 self.aS = aS self.fs = fs self.fp = fp self.nProcesses = nProcesses # vars self.__qIn = JoinableQueue() self.__qOut = Queue() self.__js = [] self.__jps = [] self.__ks = [] self.__params = [] self.__paramps = [] self.__data = {} self.__processes = [] # setup x grid def _getGridX(self,Nx): if (Nx < 2): raise "invalid argument! Nx >= 2!" self.__js = range(Nx) self.__ks = range(len(self.fs)) self.__params = [10.**(-4./(Nx-1)*j) for j in self.__js] g = [] for proj in ["G", "L", "P"]: for j in self.__js: for k in self.__ks: g.append({"proj": proj, "j": j, "x": self.__params[j], "k": k, "f": self.fs[k], "res": np.nan}) return g # setup mu2 grid def _getGridMu2(self,x,r,Nmu2,getAlphaS): if (Nmu2 < 2): raise "invalid argument! Nmu2 >= 2!" self.__js = range(Nmu2) self.__ks = range(len(self.fs)) self.__params = [r**(-1.+2./(Nmu2-1)*j) for j in self.__js] g = [] for proj in ["G", "L", "P"]: for j in self.__js: mu2 = self.mu02*self.__params[j] aS = getAlphaS(mu2) for k in self.__ks: g.append({"proj": proj, "x": x, "j": j, "mu2": mu2, "alphaS": aS,"k": k, "f": self.fs[k], "res": np.nan}) return g # setup m2 grid def _getGridM2(self,x,m2min,m2max,Nm2,getMu2,getAlphaS): if (Nm2 < 2): raise "invalid argument! Nm2 >= 2!" self.__js = range(Nm2) self.__ks = range(len(self.fs)) self.__params = [m2min + (m2max-m2min)/(Nm2-1)*j for j in self.__js] g = [] for proj in ["G", "L", "P"]: for j in self.__js: m2 = self.__params[j] mu2 = getMu2(m2) aS = getAlphaS(mu2) for k in self.__ks: g.append({"proj": proj, "x": x, "j": j, "m2": m2, "mu2": mu2, "alphaS": aS, "k": k, "f": self.fs[k], "res": np.nan}) return g # setup muR2-muF2 grid def _getGridMuR2MuF2(self,x,rR,NmuR2,rF,NmuF2,getAlphaS): if (NmuF2 < 2 or NmuR2 < 2): raise "invalid argument! NmuF2 >= 2, NmuR2 >= 2!" self.__js = range(NmuR2) self.__jps = range(NmuF2) self.__ks = range(len(self.fs)) self.__params = [rR**(-1.+2./(NmuR2-1)*j) for j in self.__js] self.__paramps = [rF**(-1.+2./(NmuF2-1)*jp) for jp in self.__jps] g = [] for proj in ["G", "L", "P"]: for j in self.__js: muR2 = self.mu02*self.__params[j] aS = getAlphaS(muR2) for jp in self.__jps: muF2 = self.mu02*self.__paramps[jp] for k in self.__ks: g.append({"proj": proj, "x": x, "j": j, "muR2": muR2, "alphaS": aS, "jp": jp, "muF2": muF2,"k": k, "f": self.fs[k], "res": np.nan}) return g # setup pdf grid def _getGridPdf(self,Nx,proj, pdf, Npdfmem): if (Nx < 2): raise "invalid argument! Nx >= 2!" if (Npdfmem < 0): raise "invalid argument! Npdfmem >= 0!" 
self.__js = range(Nx) self.__jps = range(Npdfmem+1) self.__ks = range(len(self.fs)) self.__params = [10.**(-4./(Nx-1)*j) for j in self.__js] g = [] for j in self.__js: x = self.__params[j] for k in self.__ks: for pdfMem in self.__jps: g.append({"proj": proj, "j": j, "x": x, "k": k, "f": self.fs[k], "pdf": pdf, "pdfMem": pdfMem, "res": np.nan}) return g # start processes def _compute(self,g): self.__qIn = JoinableQueue() # fill for e in g: self.__qIn.put(e) print _pinfo(),"computing %d elements"%self.__qIn.qsize() # add EOF for n in xrange(self.nProcesses): self.__qIn.put(None) self.__qOut = Queue() # start processes oArgs = { "G": (self.m2,self.q2,self.Delta,ElProduction.projT.G,self.nlf,), "L": (self.m2,self.q2,self.Delta,ElProduction.projT.L,self.nlf,), "P": (self.m2,self.q2,self.Delta,ElProduction.projT.P,self.nlf,) } lenParams = len(g) processes = [] threadArgs = (self.__qIn, self.__qOut, oArgs, self.pdfs, self.pdfMem, self.mu02, self.aS, lenParams,) for j in xrange(self.nProcesses): processes.append(Process(target=_threadWorker, args=threadArgs)) [p.start() for p in processes] # run try: self.__qIn.join() except KeyboardInterrupt: [p.terminate() for p in processes] print "\n",_pwarn(),"aborting at",self.__qOut.qsize(),"/",lenParams self.__qIn.close() sys.stdout.write("\n") # reorder in 1D def _reorder1(self): self.__data = {} self.__data["G"] = [[np.nan for k in self.__ks] for j in self.__js] self.__data["L"] = [[np.nan for k in self.__ks] for j in self.__js] self.__data["P"] = [[np.nan for k in self.__ks] for j in self.__js] l = self.__qOut.qsize() while g in range(l): p = self.__qOut.get() self.__data[p["proj"]][p["j"]][p["k"]] = p["res"] # reorder in 2D def _reorder2(self): self.__data = {} self.__data["G"] = [[[np.nan for k in self.__ks] for jp in self.__jps] for j in self.__js] self.__data["L"] = [[[np.nan for k in self.__ks] for jp in self.__jps] for j in self.__js] self.__data["P"] = [[[np.nan for k in self.__ks] for jp in self.__jps] for j in self.__js] l = self.__qOut.qsize() for g in range(l): p = self.__qOut.get() self.__data[p["proj"]][p["j"]][p["jp"]][p["k"]] = p["res"] # reorder in pdf data def _reorderPdf(self): self.__data = {} self.__data["G"] = [[[np.nan for pdfMem in self.__jps] for k in self.__ks] for j in self.__js] self.__data["L"] = [[[np.nan for pdfMem in self.__jps] for k in self.__ks] for j in self.__js] self.__data["P"] = [[[np.nan for pdfMem in self.__jps] for k in self.__ks] for j in self.__js] l = self.__qOut.qsize() for g in range(l): p = self.__qOut.get() self.__data[p["proj"]][p["j"]][p["k"]][p["pdfMem"]] = p["res"] # write data for 1D def _write1(self): with open(self.fp, "w") as f: for j in self.__js: x = self.__params[j] l = ["%e"%x] data2 = [self.__data["G"][j][k]+self.__data["L"][j][k]*3./2. for k in self.__ks] for k in self.__ks: l.append("%e"%data2[k]) for k in self.__ks: l.append("%e"%self.__data["L"][j][k]) for k in self.__ks: l.append("%e"%self.__data["P"][j][k]) f.write("\t".join(l)+"\n") # write data for 2D def _write2(self): with open(self.fp, "w") as f: for j in self.__js: x = self.__params[j] for jp in self.__jps: xp = self.__paramps[jp] l = ["%e"%x, "%e"%xp] data2 = [self.__data["G"][j][jp][k]+self.__data["L"][j][jp][k]*3./2. 
for k in self.__ks] for k in self.__ks: l.append("%e"%data2[k]) for k in self.__ks: l.append("%e"%self.__data["L"][j][jp][k]) for k in self.__ks: l.append("%e"%self.__data["P"][j][jp][k]) f.write("\t".join(l)+"\n") f.write("\n") # write data for pdf data def _writePdf(self,proj): with open(self.fp, "w") as f: for j in self.__js: x = self.__params[j] l = ["%e"%x] d = self.__data[proj][j] for k in self.__ks: l.append("%e"%np.min(d[k])) l.append("%e"%(d[k][0])) l.append("%e"%np.max(d[k])) f.write("\t".join(l)+"\n") # compute grid in 1D def _run1(self,g): if len(g) == 0: print _pwarn(),"no data!" return self._compute(g) self._reorder1() self._write1() # compute grid in 2D def _run2(self,g): if len(g) == 0: print _pwarn(),"no data!" return self._compute(g) self._reorder2() self._write2() # iterate x def runX(self,Nx): self._run1(self._getGridX(Nx)) # iterate mu2 def runMu2(self,x,r,Nmu2,getAlphaS): self._run1(self._getGridMu2(x,r,Nmu2,getAlphaS)) # iterate m2 def runM2(self,x,m2min,m2max,Nm2,getMu2,getAlphaS): self._run1(self._getGridM2(x,m2min,m2max,Nm2,getMu2,getAlphaS)) # iterate muR2 and muF2 def runMuR2MuF2(self,x,rR,NmuR2,rF,NmuF2,getAlphaS): self._run2(self._getGridMuR2MuF2(x,rR,NmuR2,rF,NmuF2,getAlphaS)) def runPdf(self,Nx,proj, pdf, Npdfmem): g = self._getGridPdf(Nx,proj, pdf, Npdfmem) if len(g) == 0: print _pwarn(),"no data!" return self._compute(g) self._reorderPdf() self._writePdf(proj)
def _parallely_make_dataset(self): #Get video_list and video_length files if they exist name_file = "{}/video_list.npy".format(self.loc) len_file = "{}/video_lengths.npy".format(self.loc) if os.path.isfile(name_file): video_list = np.load(name_file) video_lengths = np.load(len_file) return video_list, video_lengths #Files don't yet exist, so create them q = Queue() qvideo_list = Queue() #Collect list of videos to index fnames_list = [] if self.split_file != None: with open(self.split_file, 'r') as f: line = f.readline() while line: fnames_list.append( self.line_to_fname( os.path.join(self.data_dir, line.strip()))) line = f.readline() else: for root, _, fnames in tqdm(os.walk(self.data_dir)): for fname in sorted(fnames): fnames_list.append(os.path.join(root, fname)) #Truncate list if necessary if self.limit is not None: fnames_list = fnames_list[:self.limit] #Parallely open videos, get length def parallel_worker(fnames_chunk): item = q.get() for fname in tqdm(fnames_chunk): if has_file_allowed_extension(fname, VIDEO_EXTENSION): video_path = fname vc = cv2.VideoCapture(video_path) length = int(vc.get(cv2.CAP_PROP_FRAME_COUNT)) if length > 0 and vc.isOpened(): qvideo_list.put((video_path, length)) qvideo_list.task_done() vc.release() q.task_done() processes = self.parallel_processes if self.limit is not None and processes >= self.limit: processes = self.limit n = len(fnames_list) chunk = int(n / processes) if chunk == 0: chunk = 1 fnames_chunks = [fnames_list[i*chunk:(i+1)*chunk] \ for i in range((n + chunk - 1) // chunk)] for i in range(processes): q.put(i) multiprocessing.Process(target=parallel_worker, args=(fnames_chunks[i], )).start() q.join() qvideo_list.join() video_list = [] video_lengths = [] while qvideo_list.qsize() != 0: video, length = qvideo_list.get() video_list.append(video) video_lengths.append(length) np.save(name_file, video_list) np.save(len_file, video_lengths) return video_list, video_lengths
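# Hedged side sketch: the int(n / processes) chunking above can produce more chunks
# than started workers when len(fnames_list) is not evenly divisible, leaving trailing
# chunks unconsumed. np.array_split always yields exactly n_chunks pieces; numpy is
# assumed here, as it is already used by this snippet.
import numpy as np

def split_into_chunks(items, n_chunks):
    # np.array_split returns exactly n_chunks (possibly uneven) pieces.
    return [list(chunk) for chunk in np.array_split(np.array(items, dtype=object), n_chunks)]

# e.g. 10 paths over 4 workers -> chunk sizes [3, 3, 2, 2]
print([len(c) for c in split_into_chunks(['clip_%d.mp4' % i for i in range(10)], 4)])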
class FindText(BaseWorkerCustomer): NUM_WORKING_PROCESSES = 2 def __init__(self, params, *args, **kwargs): super(FindText, self).__init__(*args, **kwargs) self.path = params.get("path", "/") self.text = params.get("text", "") self.params = params # file queue to be processed by many threads self.file_queue = JoinableQueue(maxsize=0) self.result_queue = Queue(maxsize=0) self.result = [] self.is_alive = {"status": True} self.re_text = re.compile(".*" + fnmatch.translate(self.text)[:-7] + ".*", re.UNICODE | re.IGNORECASE) # remove \Z(?ms) from end of result expression def run(self): try: self.preload() except Exception as e: result = {"error": True, "message": str(e), "traceback": traceback.format_exc()} self.on_error(self.status_id, result, pid=self.pid, pname=self.name) return def worker(re_text, file_queue, result_queue, logger, timeout): while int(time.time()) < timeout: if file_queue.empty() is not True: f_path = file_queue.get() try: if not is_binary(f_path): mime = mimetypes.guess_type(f_path)[0] # исключаем некоторые mime типы из поиска if mime not in ["application/pdf", "application/rar"]: with open(f_path, "rb") as fp: for line in fp: try: line = as_unicode(line) except UnicodeDecodeError: charset = chardet.detect(line) if charset.get("encoding") in ["MacCyrillic"]: detected = "windows-1251" else: detected = charset.get("encoding") if detected is None: break try: line = str(line, detected, "replace") except LookupError: pass if re_text.match(line) is not None: result_queue.put(f_path) # logger.debug("matched file = %s " % f_path) break except UnicodeDecodeError as unicode_e: logger.error("UnicodeDecodeError %s, %s" % (str(unicode_e), traceback.format_exc())) except IOError as io_e: logger.error("IOError %s, %s" % (str(io_e), traceback.format_exc())) except Exception as other_e: logger.error("Exception %s, %s" % (str(other_e), traceback.format_exc())) finally: file_queue.task_done() else: time.sleep(REQUEST_DELAY) try: self.logger.debug("findText started with timeout = %s" % TIMEOUT_LIMIT) time_limit = int(time.time()) + TIMEOUT_LIMIT # Launches a number of worker threads to perform operations using the queue of inputs for i in range(self.NUM_WORKING_PROCESSES): p = Process( target=worker, args=(self.re_text, self.file_queue, self.result_queue, self.logger, time_limit) ) p.start() proc = psutil.Process(p.pid) proc.ionice(psutil.IOPRIO_CLASS_IDLE) proc.nice(20) self.logger.debug( "Search worker #%s, set ionice = idle and nice = 20 for pid %s" % (str(i), str(p.pid)) ) self.processes.append(p) abs_path = self.get_abs_path(self.path) self.logger.debug("FM FindText worker run(), abs_path = %s" % abs_path) if not os.path.exists(abs_path): raise Exception("Provided path not exist") self.on_running(self.status_id, pid=self.pid, pname=self.name) for current, dirs, files in os.walk(abs_path): for f in files: try: file_path = os.path.join(current, f) self.file_queue.put(file_path) except UnicodeDecodeError as e: self.logger.error("UnicodeDecodeError %s, %s" % (str(e), traceback.format_exc())) except IOError as e: self.logger.error("IOError %s, %s" % (str(e), traceback.format_exc())) except Exception as e: self.logger.error("Exception %s, %s" % (str(e), traceback.format_exc())) while int(time.time()) <= time_limit: self.logger.debug( "file_queue size = %s , empty = %s (timeout: %s/%s)" % (self.file_queue.qsize(), self.file_queue.empty(), str(int(time.time())), time_limit) ) if self.file_queue.empty(): self.logger.debug("join() file_queue until workers done jobs") self.file_queue.join() break 
else: time.sleep(REQUEST_DELAY) if int(time.time()) > time_limit: self.is_alive["status"] = False for p in self.processes: try: self.logger.debug("FM FindText terminate worker process, pid = %s" % p.pid) kill(p.pid, signal.SIGKILL, self.logger) except OSError: self.logger.error("FindText unable to terminate worker process, pid = %s" % p.pid) if self.is_alive["status"] is True: while not self.result_queue.empty(): file_path = self.result_queue.get() self.result.append(self._make_file_info(file_path)) self.on_success(self.status_id, data=self.result, pid=self.pid, pname=self.name) else: result = {"error": True, "message": "Operation timeout exceeded", "traceback": ""} self.on_error(self.status_id, result, pid=self.pid, pname=self.name) except Exception as e: result = {"error": True, "message": str(e), "traceback": traceback.format_exc()} self.on_error(self.status_id, result, pid=self.pid, pname=self.name)
def search(output_dict, rules_file): rules = [rule.split(' | ') for rule in pickle.load(open(rules_file, 'rb'))] file_list = JoinableQueue() word_dict = Manager().dict() for root, subFolders, files in os.walk(os.path.join(os.path.dirname(__file__), 'corpus', 'tagged')): for current_file in files: if current_file.endswith(".pickle"): file_list.put(os.path.join(root, current_file)) #break # TODO remove (only for testing with one file) file_count = file_list.qsize() def worker(): def rule_parser(tagged_data): parser = nltk.RegexpParser(''' NP: {<NN|NNS|NNP|NNPS|NE>} NPs: {<NP> (<,|CC> <NP>)+} ''') return parser.parse(tagged_data) def get_nltk_word(data): if isinstance(data, nltk.tree.Tree): if isinstance(data[0], tuple): return data[0][0] else: return data[0] else: return data[0] def add_to_dict(hypernym, hyponym): if not hyponym in word_dict.keys(): old_list = word_dict.get(hypernym) if not old_list: old_list = [hyponym] else: if not hyponym in old_list: old_list.append(hyponym) word_dict[hypernym] = old_list def apply_rules(data, position): for rule in rules: # search right side if rule[0] == 'HYPERNYM': possible_hypernym = get_nltk_word(data[position]) error = False word_count = 1 for word in rule[1:-1]: try: if word != get_nltk_word(data[position + word_count]): error = True word_count += 1 except IndexError: pass try: if not error: if isinstance(data[position + word_count], nltk.tree.Tree): if data[position + word_count].node == 'NP' and rule[-1] == 'NP': add_to_dict(possible_hypernym, data[position + word_count][0][0]) break elif data[position + word_count].node == 'NPs' and rule[-1] == 'NPs': for node in data[position + word_count]: if isinstance(node, nltk.tree.Tree): add_to_dict(possible_hypernym, node[0][0]) break except IndexError: pass # search left side elif rule[-1] == 'HYPERNYM': possible_hypernym = get_nltk_word(data[position]) error = False word_count = -1 nrule = list(rule) nrule.reverse() for word in nrule[1:-1]: try: if word != get_nltk_word(data[position + word_count]): error = False word_count -= 1 except IndexError: pass try: if not error: if isinstance(data[position + word_count], nltk.tree.Tree): if data[position + word_count].node == 'NP' and rule[-1] == 'NP': add_to_dict(possible_hypernym, data[position + word_count][0][0]) break elif data[position + word_count].node == 'NPs' and rule[-1] == 'NPs': for node in data[position + word_count]: if isinstance(node, nltk.tree.Tree): add_to_dict(possible_hypernym, node[0][0]) break except IndexError: pass while not file_list.empty(): input_file = file_list.get() tagged_data = rule_parser(pickle.load(open(input_file, 'rb'))) for n in range(len(tagged_data)): if isinstance(tagged_data[n], nltk.tree.Tree): if tagged_data[n].node == 'NP': apply_rules(tagged_data, n) percentage = 100.0 - ((float(file_list.qsize()) / float(file_count)) * 100.0) sys.stdout.write("\rProgress: {0:.2f}%".format(percentage)) sys.stdout.flush() file_list.task_done() sys.stdout.write("\rProgress: 0.00%") for pid in range(8): process = Process(target=worker, args=()) process.daemon = True process.start() file_list.join() print('') pickle_dict = dict() for key in word_dict.keys(): pickle_dict[key] = word_dict.get(key) pickle.dump(pickle_dict, open(output_dict, 'wb+'), 2)
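# Hedged alternative sketch: testing file_list.empty() and then calling get(), as the
# worker above does, is racy once several processes consume the same queue. A
# non-blocking get avoids the race; handle_item is a hypothetical callback standing in
# for the parsing done above.
import queue  # only for the Empty exception; the work queue stays a multiprocessing JoinableQueue

def drain(file_list, handle_item):
    while True:
        try:
            input_file = file_list.get_nowait()
        except queue.Empty:
            break  # no more work for this process
        try:
            handle_item(input_file)
        finally:
            file_list.task_done()  # always balance the earlier put() so join() can return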
class FindText(BaseWorkerCustomer): NUM_WORKING_PROCESSES = 2 def __init__(self, params, session, *args, **kwargs): super(FindText, self).__init__(*args, **kwargs) self.path = params.get('path', '/') self.session = session self.session = session self.text = params.get('text', '') self.params = params # file queue to be processed by many threads self.file_queue = JoinableQueue(maxsize=0) self.result_queue = Queue(maxsize=0) self.result = [] self.is_alive = { "status": True } self.re_text = re.compile('.*' + fnmatch.translate(self.text)[:-7] + '.*', re.UNICODE | re.IGNORECASE) # remove \Z(?ms) from end of result expression def run(self): try: self.preload() sftp = self.get_sftp_connection(self.session) self.logger.debug("findText started with timeout = %s" % TIMEOUT_LIMIT) time_limit = int(time.time()) + TIMEOUT_LIMIT # Launches a number of worker threads to perform operations using the queue of inputs sftp_managers = [] for i in range(self.NUM_WORKING_PROCESSES): p = Process(target=self.worker, args=(self.re_text, self.file_queue, self.result_queue, time_limit)) p.start() proc = psutil.Process(p.pid) proc.ionice(psutil.IOPRIO_CLASS_IDLE) proc.nice(20) self.logger.debug( "Search worker #%s, set ionice = idle and nice = 20 for pid %s" % ( str(i), str(p.pid))) self.processes.append(p) abs_path = self.path self.logger.debug("FM FindText worker run(), abs_path = %s" % abs_path) if not sftp.exists(abs_path): raise Exception("Provided path not exist") self.on_running(self.status_id, pid=self.pid, pname=self.name) for current, dirs, files in sftp.walk(abs_path): for f in files: try: file_path = os.path.join(current, f) self.file_queue.put(file_path) except UnicodeDecodeError as e: self.logger.error( "UnicodeDecodeError %s, %s" % (str(e), traceback.format_exc())) except IOError as e: self.logger.error("IOError %s, %s" % (str(e), traceback.format_exc())) except Exception as e: self.logger.error( "Exception %s, %s" % (str(e), traceback.format_exc())) while int(time.time()) <= time_limit: self.logger.debug("file_queue size = %s , empty = %s (timeout: %s/%s)" % ( self.file_queue.qsize(), self.file_queue.empty(), str(int(time.time())), time_limit)) if self.file_queue.empty(): self.logger.debug("join() file_queue until workers done jobs") self.file_queue.join() break else: time.sleep(REQUEST_DELAY) if int(time.time()) > time_limit: self.is_alive['status'] = False for sftp in sftp_managers: sftp.conn.close() for p in self.processes: try: self.logger.debug("FM FindText terminate worker process, pid = %s" % p.pid) kill(p.pid, signal.SIGKILL, self.logger) except OSError: self.logger.error( "FindText unable to terminate worker process, pid = %s" % p.pid) if self.is_alive['status'] is True: while not self.result_queue.empty(): file_path = self.result_queue.get() self.result.append(sftp.make_file_info(file_path)) self.on_success(self.status_id, data=self.result, pid=self.pid, pname=self.name) else: result = { "error": True, "message": "Operation timeout exceeded", "traceback": "" } self.on_error(self.status_id, result, pid=self.pid, pname=self.name) except Exception as e: result = { "error": True, "message": str(e), "traceback": traceback.format_exc() } self.on_error(self.status_id, result, pid=self.pid, pname=self.name) def worker(self, re_text, file_queue, result_queue, timeout): try: worker_sftp = self.get_sftp_connection(self.session) while int(time.time()) < timeout: if file_queue.empty() is not True: f_path = file_queue.get() try: if not worker_sftp.is_binary(f_path): mime = mimetypes.guess_type(f_path)[0] 
                        # exclude some mime types from the search
                        if mime not in ['application/pdf', 'application/rar']:
                            with worker_sftp.open(f_path, 'rb') as fp:
                                for line in fp:
                                    try:
                                        line = as_unicode(line)
                                    except UnicodeDecodeError:
                                        charset = chardet.detect(line)
                                        if charset.get('encoding') in ['MacCyrillic']:
                                            detected = 'windows-1251'
                                        else:
                                            detected = charset.get('encoding')
                                        if detected is None:
                                            break
                                        try:
                                            line = str(line, detected, "replace")
                                        except LookupError:
                                            pass
                                    if re_text.match(line) is not None:
                                        result_queue.put(f_path)
                                        self.logger.debug("matched file = %s " % f_path)
                                        break
                except UnicodeDecodeError as unicode_e:
                    self.logger.error(
                        "UnicodeDecodeError %s, %s" % (str(unicode_e), traceback.format_exc()))
                except IOError as io_e:
                    self.logger.error("IOError %s, %s" % (str(io_e), traceback.format_exc()))
                except Exception as other_e:
                    self.logger.error("Exception %s, %s" % (str(other_e), traceback.format_exc()))
                finally:
                    file_queue.task_done()
            else:
                time.sleep(REQUEST_DELAY)
        worker_sftp.close()
    except Exception as e:
        result = {
            "error": True,
            "message": str(e),
            "traceback": traceback.format_exc()
        }
        self.logger.error('SFTP FindText Worker Exception {}'.format(result))
def optimized_compute(self): """ First process: it computes the quilt algorithm with big tiles, manages child processes and them combines the results. 1) creates the child processes (number defined according to the available cores and the number of big tiles in the image) 2) computes quilting with big tiles 3) every time a tile is computed (and sewed with the image), it is put in a queue process 1: big tiles for each of the tile: process n """ self.log.info('\nMULTIPROCESSING COMPUTING ...') big_num_tiles = self.calc_num_tiles(tile_size=self.big_tilesize, overlap=self.big_overlap) # prepare the pool n_proc = min(big_num_tiles[0] * big_num_tiles[1], self.cores) out_queue = Queue() in_queue = JoinableQueue() pool = Pool(n_proc, unwrap_self, ( self, in_queue, out_queue, )) self.log.info('preparing {0} processes - {1}'.format( n_proc, time.strftime("%H:%M:%S"))) if self.Ymask is not None: # zero values will become inf Ymask_rgb = gray2rgb(self.Ymask) # use the mask as a draft of the dst img so that boundaries are # respected self.Y[0] = deepcopy(Ymask_rgb) for i in xrange(big_num_tiles[0]): startI = i * self.big_tilesize - i * self.big_overlap endI = min(self.Y[0].shape[0], startI + self.big_tilesize) sizeI = endI - startI if sizeI <= self.overlap: continue for j in xrange(big_num_tiles[1]): startJ = j * self.big_tilesize - j * self.big_overlap endJ = min(self.Y[0].shape[1], startJ + self.big_tilesize) sizeJ = endJ - startJ if sizeJ <= self.overlap: continue dst_patches = [y[startI:endI, startJ:endJ, :] for y in self.Y] # for the big tiles don't consider the mask, since it would # remove most of the image because the tiles are so big res_patches = self._compute_patch( dst_patches, [sizeI, sizeJ], (i, j), mask=self.Xmask_big, constraint_start=self.constraint_start, err=0.8) # add the mask on top if self.Ymask is not None: res_patches = [ r * Ymask_rgb[startI:endI, startJ:endJ] for r in res_patches ] for idx, res in enumerate(res_patches): self.Y[idx][startI:endI, startJ:endJ, :] = res # make a process start in this big tile _img = [y[startI:endI, startJ:endJ, :] for y in self.Y] _mask = self.Ymask[startI:endI, startJ:endJ] \ if self.Ymask is not None else None _id = (startI, startJ) in_queue.put({'dst': _img, 'mask': _mask, 'id': _id}) # wait for all the children self.log.debug('master finished {0}'.format(time.strftime("%H:%M:%S"))) show(self.Y[0]) if self.debug else None pool.close() self.log.debug('closed, in queue: {0} out: {1}'.format( in_queue.qsize(), out_queue.qsize())) in_queue.join() self.log.debug('all children finished {0}'.format( time.strftime("%H:%M:%S"))) # get the results results = sorted([ out_queue.get() for _ in xrange(big_num_tiles[0] * big_num_tiles[1]) ]) # sew them together for idx, res in results: # calculate the mask base_patch = self.Y[0][idx[0]:idx[0] + self.big_tilesize, idx[1]:idx[1] + self.big_tilesize] new_patch = res[0] mask_patch = self.calc_patch_mask(base_patch, new_patch, coord=idx, overlap=self.big_overlap) # apply the mask to each layer for i, y in enumerate(self.Y): base_patch = y[idx[0]:idx[0] + self.big_tilesize, idx[1]:idx[1] + self.big_tilesize] new_patch = res[i] self.Y[i][idx[0]:idx[0]+self.big_tilesize, idx[1]:idx[1]+self.big_tilesize, :] = \ filter_img(new_patch, base_patch, mask_patch) # apply the mask again if self.Ymask is not None: self.Y = [r * Ymask_rgb for r in self.Y] show(self.Y[0]) if self.debug else None if self.result_path: save(self.Y[0], self.result_path) self.log.info('saving' + self.result_path)
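# A minimal, self-contained sketch of the master/worker pattern optimized_compute()
# relies on: Pool workers run an initializer that loops over a JoinableQueue of tiles
# and push results to a plain Queue. tile_worker and process_tile are illustrative
# names only, not the functions used above.
from multiprocessing import Pool, Queue, JoinableQueue

def process_tile(tile):
    return tile * 2  # stand-in for the real patch computation

def tile_worker(in_queue, out_queue):
    while True:
        tile = in_queue.get()
        try:
            if tile is None:
                return  # sentinel: this worker is done
            out_queue.put(process_tile(tile))
        finally:
            in_queue.task_done()

if __name__ == '__main__':
    in_q, out_q = JoinableQueue(), Queue()
    n_proc, n_tiles = 4, 8
    pool = Pool(n_proc, tile_worker, (in_q, out_q))
    for t in range(n_tiles):
        in_q.put(t)
    for _ in range(n_proc):
        in_q.put(None)          # one sentinel per worker
    in_q.join()                 # wait until every tile (and sentinel) is task_done()
    print(sorted(out_q.get() for _ in range(n_tiles)))
    pool.close()
    pool.join()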
def learning(input_dict, output_rules): def worker(): def rule_parser(tagged_data): parser = nltk.RegexpParser(''' NP: {<NN|NNS|NNP|NNPS|NE>} NPs: {<NP> (<,|CC> <NP>)+} ''') return parser.parse(tagged_data) def find_hypernyms(pre_parsed_data, hypernym_list): hypernym_positions = [] for n in range(len(pre_parsed_data)): if isinstance(pre_parsed_data[n], nltk.tree.Tree): if pre_parsed_data[n].node == 'NP': if pre_parsed_data[n][0][0] in hypernym_list: hypernym_positions.append((n, pre_parsed_data[n][0][0])) return hypernym_positions def find_pattern(pre_parsed_data, hypernym, hypernym_list): left = [] right = [] start_pos = hypernym[0] def add_to_list(current_list, element, position, has_hyponym): try: if isinstance(element[position], nltk.tree.Tree): current_list.append((element[position][0][0], element[position].node)) if element[position].node == 'NP': if element[position][0][0] in hypernym_list[hypernym[1]]: has_hyponym.append(len(current_list)) elif element[position].node == 'NPs': for possible_np in element[position][0]: if isinstance(possible_np, nltk.tree.Tree): if possible_np[0][0] in hypernym_list[hypernym[1]]: has_hyponym.append(len(current_list)) else: current_list.append((element[position][0], element[position][1])) except IndexError: pass # search the right side for a pattern has_hyponym = [] for i in range(start_pos, start_pos + MAX_SEARCH_RANGE, 1): add_to_list(right, pre_parsed_data, i, has_hyponym) if has_hyponym: return 'right', right[:has_hyponym[-1]] has_hyponym = [] # search the left side for a pattern for i in range(start_pos, start_pos - MAX_SEARCH_RANGE, -1): add_to_list(left, pre_parsed_data, i, has_hyponym) if has_hyponym: left.reverse() return 'left', left[(MAX_SEARCH_RANGE - has_hyponym[-1]):] return None def add_rule(rule): if len(rule[1]) >= 3: rts = [] count = 0 for element in rule[1]: if element[1] == 'NPs' or element[1] == 'NP': if count == 0 or count == len(rule[1]) - 1: rts.append(element[1]) else: if element[1] == 'NP': rts.append(element[0]) else: for node in element[1]: rts.append(node[0]) else: rts.append(element[0]) count += 1 if rule[0] == 'left': rts[len(rule[1]) - 1] = 'HYPERNYM' else: rts[0] = 'HYPERNYM' rts_str = ' | '.join(rts) if rts_str in rules: rules[rts_str] += 1 else: rules[rts_str] = 1 while not file_list.empty(): input_file = file_list.get() tagged_data = pickle.load(open(input_file, 'rb')) pre_parsed_data = rule_parser(tagged_data) hypernym_positions = find_hypernyms(pre_parsed_data, h_dict.keys()) for hypernym in hypernym_positions: rule = find_pattern(pre_parsed_data, hypernym, h_dict) if rule: add_rule(rule) percentage = 100.0 - ((float(file_list.qsize()) / float(file_count)) * 100.0) sys.stdout.write("\rProgress: {0:.2f}%".format(percentage)) sys.stdout.flush() file_list.task_done() def blacklist_filter(rule): result = True number_blacklist = [] for i in range(100): number_blacklist.append(num2words(i).encode('ascii')) for word in (WORD_BLACKLIST + number_blacklist): if word in rule: result = False return result h_dict = pickle.load(open(input_dict, 'rb')) rules = Manager().dict() file_list = JoinableQueue() sys.stdout.write("\rProgress: 0.00%") for root, subFolders, files in os.walk(os.path.join(os.path.dirname(__file__), 'corpus', 'tagged')): for current_file in files: if current_file.endswith(".pickle"): file_list.put(os.path.join(root, current_file)) file_count = file_list.qsize() for pid in range(8): process = Process(target=worker, args=()) process.daemon = True process.start() file_list.join() print('') # filter the rules and 
    # save them
    filtered_rules = {k: v for k, v in rules.items()
                      if v >= THRESHOLD and blacklist_filter(k)}
    rule_list = sorted(filtered_rules.keys(), key=filtered_rules.get)
    rule_list.reverse()

    pickle.dump(rule_list, open(output_rules, 'wb+'), 2)
self.outputqueue.put( self.compute(n)) if __name__ == '__main__': begintime = time.time() inputqueue = Queue() outputqueue = Queue() threads = int(sys.argv[1]) feeder = Feeder(inputqueue,outputqueue, threads) feeder.start() duration = time.time() - begintime print("Feeder started (" + str(duration) + "s)") for _ in range(0,threads): processor = Processor(inputqueue,outputqueue) processor.start() inputqueue.join() duration = time.time() - begintime print("Inputqueue done (" + str(duration) + "s)") outputqueue.put(None) print("Outputqueue length (" + str(outputqueue.qsize()) + ")") feeder.join() duration = time.time() - begintime print("Outputqueue done (" + str(duration) + "s)")
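# Hedged sketch of Feeder/Processor shapes that would satisfy the main block above;
# their real definitions are not shown here. Note that inputqueue.join() only exists on
# a JoinableQueue, so Queue is assumed to be (or alias) multiprocessing.JoinableQueue.
from multiprocessing import Process, JoinableQueue

class Feeder(Process):
    def __init__(self, inputqueue, outputqueue, threads):
        super().__init__()
        self.inputqueue, self.outputqueue, self.threads = inputqueue, outputqueue, threads

    def run(self):
        for n in range(100):           # illustrative workload
            self.inputqueue.put(n)
        for _ in range(self.threads):  # one stop sentinel per Processor
            self.inputqueue.put(None)

class Processor(Process):
    def __init__(self, inputqueue, outputqueue):
        super().__init__()
        self.inputqueue, self.outputqueue = inputqueue, outputqueue

    def compute(self, n):
        return n * n                   # stand-in for the real computation

    def run(self):
        while True:
            n = self.inputqueue.get()
            try:
                if n is None:
                    break
                self.outputqueue.put(self.compute(n))
            finally:
                self.inputqueue.task_done()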
return [*file_hashes] if __name__ == '__main__': freeze_support() print(perf_counter()) with EXECUTOR(max_workers=MAX_WORKERS) as executor: in_dirs = 0 out_dirs = 1 in_list = list() out_list = [ basepath, ] hash_count = 0 sleeps = 0 while not q.empty() and q.qsize(): item = q.get() print(f'{in_dirs}: {item}') in_list.append(item) in_dirs += 1 future_dirs = executor.submit(dir_worker, item) dirs = future_dirs.result() q_dirs = [*map(q.put, dirs)] #print(dirs) out_dirs += len(dirs) out_list.extend(dirs) future_hashes = executor.submit(hash_worker, item) q_hashes = [*map(hash_q.put, dirs)] hashes = future_hashes.result() print(hashes) hash_count += len(hashes)
import nltk
import os
import sys
import re
import pickle
from multiprocessing import JoinableQueue  # needed for file_list below

nltk.data.path.append(os.path.join(os.path.dirname(__file__), 'nltk'))

file_list = JoinableQueue()
for root, subFolders, files in os.walk(os.path.join(os.path.dirname(__file__), 'corpus', 'plain')):
    for current_file in files:
        if current_file.endswith(".txt"):
            file_list.put(os.path.join(root, current_file))
file_count = file_list.qsize()


def worker():
    while not file_list.empty():
        input_file = file_list.get()
        with open(input_file, 'r') as c_file:
            contents = c_file.read()
        # collapse runs of whitespace
        contents = re.sub(r'\s{2,}', ' ', contents)
        tokens = [word for sent in nltk.sent_tokenize(contents)
                  for word in nltk.word_tokenize(sent)]
        pos_tagged = nltk.pos_tag(tokens)
        pos_tagged = nltk.ne_chunk(pos_tagged)
        file_list.task_done()  # balance the put() so file_list.join() can return
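# Hedged continuation sketch: the fragment above is truncated before the workers are
# launched; the sibling workers in this document start eight daemon processes and then
# join the queue, so this one would presumably be started the same way.
from multiprocessing import Process

for _pid in range(8):
    _p = Process(target=worker)
    _p.daemon = True
    _p.start()
file_list.join()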
shape = image.shape f = h5py.File('/dev/shm/test.h5','w') data = f.create_dataset('data', (len(minc_volumes),) + image.shape, chunks=(CHUNKSIZE,) + image.shape, compression='gzip') names = f.create_dataset('names', (len(minc_volumes),), dtype=h5py.special_dtype(vlen=unicode)) vf.close() q = JoinableQueue() p = Process(target=enqueue_volumes, args=(q,minc_volumes)) p.start() while True: item = q.get(block=True) q.task_done() if item is None: break i, j, zeros, namelist = item print 'data[{}:{},:] = zeros[:{},:]'.format(i, i+j+1, j+1) data[i:i+j+1,:] = zeros[:j+1,:] print 'names[{}:{}] = namelist'.format(i, i+j+1) print "qsize:", q.qsize() names[i:i+j+1] = namelist f.close()
class ProcessPool: def __init__(self, max_number_of_processs: int, process_name_prifix: str = ''): self.logger = logging.getLogger(__name__) self._queue = JoinableQueue() self._res_queue = Queue() self._process_pool = [] self.output = [] self._lock = Lock() self._processes_count = max_number_of_processs for i in range(max_number_of_processs): self._process_pool.append( self._create_pr('{}{:02d}'.format(process_name_prifix, i))) def _create_pr(self, th_name) -> WorkerProcess: t = WorkerProcess(self._queue, self._res_queue, self._lock, name=th_name, status_log_interval=self._processes_count * 20) t.daemon = True t.start() return t @property def queue(self): return self._queue def kill_them_all(self, timeout=5): logger = logging.getLogger(__name__) logger.debug('closing all processs') for t in self._process_pool: # I'm putting none to push queue out of block self._queue.put((None, None)) logger.debug('collecting all output data from processs') self._res_queue.put(None) if sys.platform == "linux" or sys.platform == "linux2" or\ sys.platform == "win32": output_count = self._res_queue.qsize() else: output_count = None # result = self._res_queue.get() collected = 0 number_of_none_outputs = 0 none_indeces = [] while True: try: result = self._res_queue.get_nowait() collected += 1 except queue.Empty: if output_count is not None: if collected < output_count: continue else: break else: break if result is None: number_of_none_outputs += 1 none_indeces.append(collected - 1) else: self.output.append(result) logger.info('from all {} ouputs {} were collected {} ' 'of them were None. None indeces: {}'.format( output_count, len(self.output), number_of_none_outputs, none_indeces)) logger.debug('data were collected waiting for processses to join') for t in self._process_pool: t.join(timeout) logger.debug('Processses joined successfully - now closing them all') for t in self._process_pool: try: t.close() except ValueError as err: logger.error(err, exc_info=True) logger.info('Closing the process was not seuccessful.' ' I will terminate it') t.terminate() logger.debug('processs all closed') def get_status(self) -> str: stat = '' input_qsz = self._queue.qsize() output_qsz = self._res_queue.qsize() stat += ("{} processes with inputs ({}) and outputs({})\n".format( len(self._process_pool), input_qsz, output_qsz)) for ps in self._process_pool: name = ps.name st = 'alive' if ps.is_alive() else 'dead' stat += ('\t\tps {} is {}\n'.format(name, st)) return stat
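# Hedged helper expressing the platform check above: qsize() raises NotImplementedError
# on macOS, where sem_getvalue() is unavailable, hence the linux/win32 special case.
def safe_qsize(q):
    try:
        return q.qsize()
    except NotImplementedError:
        return None  # e.g. macOS; the caller must fall back to draining until Empty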
def bulk(bucket_name, prefix, concurrency, glob_load, resume_load, dest_dataset=None, alias=None, cluster_by=(), drop=(), rename={}, replace=()): """ Load data into BigQuery concurrently Args: bucket_name: gcs bucket name (str) prefix: object key path, 'dataset/version' (str) concurrency: number of processes to handle the load (int) glob_load: load data by globbing path dirs (boolean) resume_load: resume load (boolean) dest_dataset: override default dataset location (str) alias: override object key derived table name (str) cluster_by: top level fields to cluster by (Tuple[str]) drop: top level fields to exclude (Tuple[str]) rename: top level fields to rename (Dict[str,str]) replace: top field replacement expressions (Tuple[str]) """ _dest_dataset = dest_dataset or DEFAULT_DATASET logging.info('main_process: dataset set to {}'.format(_dest_dataset)) q = JoinableQueue() msg = Queue() lock = Lock() if glob_load: logging.info('main_process: loading via glob method') object_keys = get_latest_object(bucket_name, prefix) if resume_load: object_keys = remove_loaded_objects(object_keys, _dest_dataset, alias, rename) for path, object_key in object_keys.items(): q.put((bucket_name, path, object_key)) else: logging.info('main_process: loading via non-glob method') object_keys = list_blobs_with_prefix(bucket_name, prefix) for object_key in object_keys: q.put((bucket_name, None, object_key)) args = (lock, q, msg, _dest_dataset, alias, cluster_by, drop, rename, replace) for c in range(concurrency): p = Process(target=_bulk_run, args=(c, ) + args) p.daemon = True p.start() logging.info('main_process: {} total tasks in queue'.format(q.qsize())) q.join() for c in range(concurrency): q.put(None) p.join() # if the msg queue is not empty it indicates that an exception occured in # the child process. exit 1 to indicate a failure event. # we only send error type messages right now but in the future we could # potentially have others if msg.empty(): logging.info('main_process: done') exit(0) else: logging.error('main_process: exceptions occured in child processes') while not msg.empty(): error_msg = msg.get() logging.error('main_process: {} had error {}'.format( error_msg[1], error_msg[2])) exit(1)
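# Hypothetical invocation showing the call shape documented in the bulk() docstring;
# every value below is a placeholder, not taken from the source.
if __name__ == '__main__':
    bulk(
        bucket_name='example-bucket',
        prefix='dataset/v1',
        concurrency=4,
        glob_load=True,
        resume_load=False,
        dest_dataset=None,                # fall back to DEFAULT_DATASET
        alias=None,
        cluster_by=('submission_date',),  # placeholder clustering field
        drop=(),
        rename={},
        replace=(),
    )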
# if status_code == 200: # break # else: # proxy = 'https://%s' % ip_port # else: # print(os.getpid(), 'all down') # continue # print(os.getpid(), 'ok:', proxy) # break if __name__ == '__main__': # company_link = 'https://www.lagou.com/jobs/{pos_id}.html'.format(pos_id=3893733) # response = lagou_detail(company_link) # print(response.text) # lagou() # 183.233.89.222 from multiprocessing import Pool from multiprocessing import Process from multiprocessing import JoinableQueue queue = JoinableQueue() # pool = Pool(processes=4) for i in range(4): Process(target=proxy_pool, args=(queue, )).start() # pool.apply_async(proxy_pool, args=(queue,)) # pool.close() # pool.join() while True: print('queue =', queue.qsize()) time.sleep(5) print('ending')
def _parallelly_make_dataset(self): import multiprocessing from multiprocessing import Process from multiprocessing import JoinableQueue as Queue name_file = '{}/video_list.npy'.format(self.loc) len_file = '{}/video_lengths.npy'.format(self.loc) if isfile(name_file): video_list = np.load(name_file) video_lengths = np.load(len_file) return video_list, video_lengths q = Queue() qvideo_list = Queue() fnames_list = [] for root, _, fnames in tqdm(os.walk(self.root)): for fname in sorted(fnames): fnames_list.append(os.path.join(root, fname)) def parallel_worker(fnames_chunk): item = q.get() for fname in tqdm(fnames_chunk): if has_file_allowed_extension(fname, VIDEO_EXTENSION): video_path = fname vc = cv2.VideoCapture(video_path) length = int(vc.get(cv2.CAP_PROP_FRAME_COUNT)) if length > 0 and vc.isOpened(): qvideo_list.put((video_path, length)) qvideo_list.task_done() vc.release() q.task_done() processes = 32 n = len(fnames_list) chunk = int(n / processes) if chunk == 0: chunk = 1 fnames_chunks = [fnames_list[i*chunk:(i+1)*chunk] \ for i in range((n + chunk - 1) // chunk)] for i in range(processes): q.put(i) multiprocessing.Process(target=parallel_worker, args=(fnames_chunks[i], )).start() q.join() qvideo_list.join() video_list = [] video_lengths = [] while qvideo_list.qsize() != 0: video, length = qvideo_list.get() video_list.append(video) video_lengths.append(length) np.save(name_file, video_list) np.save(len_file, video_lengths) return video_list, video_lengths
print("Getting list of connected research articles") article_PMIDs = r.smembers('linked_articles') article_URLs = r.mget(['{0}:URL'.format(PMID) for PMID in article_PMIDs]) uncatalogged = Queue() for args in zip(article_URLs, article_PMIDs): uncatalogged.put(args) if DEBUG: # Run one process print("DEBUG: Running single threaded.") parallel_worker() else: print("Starting pool") NUM_WORKERS = 7 pool = Pool(NUM_WORKERS) results = [pool.apply_async(parallel_worker) for i in range(NUM_WORKERS)] print("Running progress capture.") while (True): remaining = uncatalogged.qsize() print "Waiting for", remaining, "tasks to complete..." time.sleep(0.5) # print [result.get() for result in results] uncatalogged.join() print 'Done'
conn.row_factory = sqlite3.Row vids = load_vendor_ids('vendorids.txt') #vids = {'04f9':1, '0424':1} vids_done = {} #get_completed_vids() request_queue = JoinableQueue() result_queue = JoinableQueue() [ request_queue.put((vid, prepare_driver_req(vid))) for vid in vids if vid not in vids_done ] print 'total vids: %d, vids done: %d, vids remaining: %d' % ( len(vids), len(vids_done), request_queue.qsize()) pool = Pool(4, request_worker, (request_queue, )) pool.close() while True: try: vid, current_page, total_pages, drivers = result_queue.get(True) except Exception as e: traceback.print_exc() break if drivers: print 'saving %s drivers for %s' % (len(drivers), vid) save_drivers(drivers, vid) else:
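# Hedged helper: result_queue.get(True) above blocks indefinitely once the workers go
# idle, so the surrounding except clause can never end the loop on its own; a bounded
# get makes the exit condition explicit.
from queue import Empty

def get_with_timeout(q, seconds=30):
    # Returns the next item, or None if nothing arrives within `seconds`.
    try:
        return q.get(True, seconds)
    except Empty:
        return None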
class Crawler(): def __init__(self): self.db = Mongo() self.cdb = DbClient() self.page = None self.session = None self.set_session() self.search_url_Queue = JoinableQueue() def set_session(self): s = requests.session() s.cookies.update(self.get_cookie()) s.headers.update(HEADERS) self.session = s def get_cookie(self): # 获取不为空的cookie while True: q = self.cdb.get_cookies(flag=1) if q == None: print('时间等待') time.sleep(10) continue else: d = {} if q: self.user = q['user'] cookies = q['cookies'] for cookie in cookies: d[cookie.get('name')] = cookie.get('value') return d def get_page(self, url): url = url #r = self.session.get(url, headers=HEADERS, cookies=self.get_cookie()) r = self.session.get(url, timeout=(10, 15)) if r.text.find('亲,小二正忙,滑动一下马上回来') > 0: print("cookie需要验证!!!") self.cdb.update_cookie_flag2(self.user) return False if r.text.find('请输入') > 0: print("Need Login!!!") self.cdb.update_cookie_flag0(self.user) return False self.page = r.text self.parse() time.sleep(4) return True def parse(self): pattern = re.compile(r'g_page_config = ({.*});') m = re.search(pattern, self.page) if not m: print('Cannot fount data in this page.') with open('log_page.txt', 'w') as f: f.write(self.page) return False g_page_config = json.loads(m.group(1)) auctions = g_page_config['mods']['itemlist']['data']['auctions'] for auction in auctions: try: simil_url_short = auction.get('i2iTags', { "samestyle": '/' }).get('samestyle', {"url", '/'}).get('url', '') except Exception as e: simil_url_short = '' d = {} d['keyword'] = KEYWORD d['t_link'] = 'https:' + auction.get('detail_url', '/') d['title'] = auction.get('raw_title') d['price'] = auction.get('view_price') d['shop_name'] = auction.get('nick') d['sales_num'] = auction.get('view_sales', '0').replace('人收货', '').replace('人付款', '') d['simil_url_short'] = simil_url_short d['flag'] = 0 print(d.get('keyword'), d.get('title'), d.get('simil_url_short')) self.db.insert(d) def run_cry(self): while True: print('【{}实时展示需要-请求-的原商品-链接】', self.search_url_Queue.qsize()) search_url = self.search_url_Queue.get() # 获得搜寻数据 print('Crawling page {}'.format(search_url)) flag = self.get_page(url=search_url) self.search_url_Queue.task_done() def run(self): for i in range(1, 4): page = str(i * 44) url = 'https://s.taobao.com/search?q=' + KEYWORD + '&sort=sale-desc&s=' + page print('搜索的初始url', url) self.search_url_Queue.put(url) Thread_list = [] for i in range(1): Tsearch_page = threading.Thread(target=self.run_cry, args=()) Thread_list.append(Tsearch_page) for p in Thread_list: p.daemon = True p.start() for all in [ self.search_url_Queue, self.parse_data_search_shop_Queue, self.data_search_shop_Queue, self.parse_data_simil_shop_Queue, self.data_simil_shop_Queue, ]: all.join()
histogram_merge_worker.start() if args.top: reader_procs = [ psutil.Process(reader.pid) for reader in readers ] worker_procs = [ psutil.Process(worker.pid) for worker in workers ] pair_buffer={} scaffold_count={} # while (not inq.empty()) or sum( [reader.is_alive() for reader in readers] )>0: while True: if args.debug: print("get") try: procid,scaffold,pairs = inq.get() # procid,scaffold,pairs = inq.get(True,10) #print("#got data:",procid,scaffold,len(pairs)) print("#got data from inq:",procid,scaffold,len(pairs),inq.empty(),inq.qsize(),inq.full(),strftime("%Y-%m-%d %H:%M:%S"),sum( [reader.is_alive() for reader in readers] ),"q.size():",q.qsize(),file=sys.stderr,sep="\t") sys.stderr.flush() sys.stdout.flush() except Exception as e: print(e,file=sys.stderr) if args.top: print("queue get timed out",[reader.cpu_percent() for reader in reader_procs],[worker.cpu_percent() for worker in worker_procs]) #print("#timed out",inq.empty()) print("#read from queue timed out:",inq.empty(),inq.qsize(),inq.full(),strftime("%Y-%m-%d %H:%M:%S"),sum( [reader.is_alive() for reader in readers] ),file=sys.stderr,sep="\t") sys.stderr.flush() continue if args.debug: print("got") if not scaffold in pair_buffer: pair_buffer[scaffold]=[] pair_buffer[scaffold] += pairs scaffold_count[scaffold] = scaffold_count.get(scaffold,0)+1