Example #1
def multi_mode(cli_parsed):
    dbm = db_manager.DB_Manager(cli_parsed.d + '/ew.db')
    dbm.open_connection()
    if not cli_parsed.resume:
        dbm.initialize_db()
    dbm.save_options(cli_parsed)
    m = Manager()
    targets = m.Queue()
    lock = m.Lock()
    multi_counter = m.Value('i', 0)
    display = None

    def exitsig(*args):
        dbm.close()
        if current_process().name == 'MainProcess':
            print ''
            print 'Resume using ./EyeWitness.py --resume {0}'.format(
                cli_parsed.d + '/ew.db')
        os._exit(1)

    signal.signal(signal.SIGINT, exitsig)
    if cli_parsed.resume:
        pass
    else:
        url_list, rdp_list, vnc_list = target_creator(cli_parsed)
        if cli_parsed.web:
            for url in url_list:
                dbm.create_http_object(url, cli_parsed)
        for rdp in rdp_list:
            dbm.create_vnc_rdp_object('rdp', rdp, cli_parsed)
        for vnc in vnc_list:
            dbm.create_vnc_rdp_object('vnc', vnc, cli_parsed)

    if cli_parsed.web:
        if cli_parsed.web and not cli_parsed.show_selenium:
            display = Display(visible=0, size=(1920, 1080))
            display.start()

        multi_total = dbm.get_incomplete_http(targets)
        if multi_total > 0:
            if cli_parsed.resume:
                print 'Resuming Web Scan ({0} Hosts Remaining)'.format(
                    str(multi_total))
            else:
                print 'Starting Web Requests ({0} Hosts)'.format(
                    str(multi_total))

        if multi_total < cli_parsed.threads:
            num_threads = multi_total
        else:
            num_threads = cli_parsed.threads
        for i in xrange(num_threads):
            targets.put(None)
        try:
            workers = [
                Process(target=worker_thread,
                        args=(cli_parsed, targets, lock, (multi_counter,
                                                          multi_total)))
                for i in xrange(num_threads)
            ]
            for w in workers:
                w.start()
            for w in workers:
                w.join()
        except Exception as e:
            print str(e)

        # Set up UA table here
        if cli_parsed.cycle is not None:
            ua_dict = get_ua_values(cli_parsed.cycle)
            if not cli_parsed.ua_init:
                dbm.clear_table("ua")
                completed = dbm.get_complete_http()
                completed[:] = [x for x in completed if x.error_state is None]
                for item in completed:
                    for browser, ua in ua_dict.iteritems():
                        dbm.create_ua_object(item, browser, ua)

                cli_parsed.ua_init = True
                dbm.clear_table("opts")
                dbm.save_options(cli_parsed)

            for browser, ua in ua_dict.iteritems():
                targets = m.Queue()
                multi_counter.value = 0
                multi_total = dbm.get_incomplete_ua(targets, browser)
                if multi_total > 0:
                    print(
                        "[*] Starting requests for User Agent {0}"
                        " ({1} Hosts)").format(browser, str(multi_total))
                if multi_total < cli_parsed.threads:
                    num_threads = multi_total
                else:
                    num_threads = cli_parsed.threads
                for i in xrange(num_threads):
                    targets.put(None)
                workers = [
                    Process(target=worker_thread,
                            args=(cli_parsed, targets, lock,
                                  (multi_counter, multi_total), (browser, ua)))
                    for i in xrange(num_threads)
                ]
                for w in workers:
                    w.start()
                for w in workers:
                    w.join()

    if any((cli_parsed.vnc, cli_parsed.rdp)):
        log._LOG_LEVEL = log.Level.ERROR
        multi_total, targets = dbm.get_incomplete_vnc_rdp()
        if multi_total > 0:
            print ''
            print 'Starting VNC/RDP Requests ({0} Hosts)'.format(
                str(multi_total))

            app = QtGui.QApplication(sys.argv)
            timer = QTimer()
            timer.start(10)
            timer.timeout.connect(lambda: None)

            # add qt4 reactor
            import qt4reactor
            qt4reactor.install()
            from twisted.internet import reactor

            for target in targets:
                if os.path.dirname(cli_parsed.d) != os.path.dirname(
                        target.screenshot_path):
                    target.set_paths(cli_parsed.d)
                tdbm = db_manager.DB_Manager(cli_parsed.d + '/ew.db')
                if target.proto == 'vnc':
                    reactor.connectTCP(
                        target.remote_system, target.port,
                        vnc_module.RFBScreenShotFactory(
                            target.screenshot_path, reactor, app, target,
                            tdbm))
                else:
                    reactor.connectTCP(
                        target.remote_system, int(target.port),
                        rdp_module.RDPScreenShotFactory(
                            reactor, app, 1200, 800, target.screenshot_path,
                            cli_parsed.timeout, target, tdbm))
            reactor.runReturn()
            app.exec_()

    if display is not None:
        display.stop()
    results = dbm.get_complete_http()
    vnc_rdp = dbm.get_complete_vnc_rdp()
    dbm.close()
    m.shutdown()
    write_vnc_rdp_data(cli_parsed, vnc_rdp)
    sort_data_and_write(cli_parsed, results)
    if cli_parsed.ocr:
        for target in targets:
            try:
                rdp_module.parse_screenshot(cli_parsed.d, target)
            except IOError:
                pass
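A recurring pattern in the example above is stopping queue-fed workers by enqueuing one None sentinel per worker. A minimal, self-contained sketch of just that pattern, with made-up work standing in for the EyeWitness worker_thread:

from multiprocessing import Manager, Process

def worker(targets, results):
    # Consume items until the None sentinel arrives, then exit.
    while True:
        item = targets.get()
        if item is None:
            break
        results.put(item * item)

if __name__ == '__main__':
    m = Manager()
    targets, results = m.Queue(), m.Queue()
    for i in range(10):
        targets.put(i)
    num_workers = 3
    for _ in range(num_workers):
        targets.put(None)          # one sentinel per worker
    workers = [Process(target=worker, args=(targets, results))
               for _ in range(num_workers)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    print(sorted(results.get() for _ in range(10)))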
Example #2
    s = SessionWrapper.new(init=True)
    res = s.query(PullRequest.slug).distinct()
    for r in res:
        seen.add(r.slug)
    return seen


if __name__ == '__main__':
    pr_file = 'tmp_pullrequests.csv'
    # comment_file = 'tmp_comments.csv'
    logger = logging_config.get_logger('pr_extractor')
    try:
        tokens = Tokens()
        tokens_iter = tokens.iterator()
        manager = Manager()
        tokens_queue = manager.Queue()
        for token in tokens_iter:
            tokens_queue.put(token)
        tokens_map = manager.dict()

        extractor = PrAndCommentExtractor(tokens, tokens_queue, tokens_map)
        print("Retrieving the list of cloned GitHub project")
        slugs = get_github_slugs(sys.argv[1])
        print("%s" % len(slugs))
        print("Retrieving the list of project already analyzed")
        extractor.seen = get_already_parsed_projects()
        print("%s" % len(extractor.seen))
        print("Beginning data extraction")
        extractor.start(slugs, pr_file)
        print("Storing data into db")
        extractor.add_to_db(pr_file)
Example #3
class MultiprocessingManager:
    """The facade class for the Holmes library used in a multiprocessing environment.
        This class is threadsafe.

    Parameters:

    model -- the name of the spaCy model, e.g. *en_core_web_lg*
    overall_similarity_threshold -- the overall similarity threshold for embedding-based
        matching. Defaults to *1.0*, which deactivates embedding-based matching.
    embedding_based_matching_on_root_words -- determines whether or not embedding-based
        matching should be attempted on root (parent) tokens, which has a considerable
        performance hit. Defaults to *False*.
    ontology -- an *Ontology* object. Defaults to *None* (no ontology).
    analyze_derivational_morphology -- *True* if matching should be attempted between different
        words from the same word family. Defaults to *True*.
    perform_coreference_resolution -- *True*, *False* or *None* if coreference resolution
        should be performed depending on whether the model supports it. Defaults to *None*.
    debug -- a boolean value specifying whether debug representations should be outputted
        for parsed sentences. Defaults to *False*.
    verbose -- a boolean value specifying whether status messages should be outputted to the
        console. Defaults to *True*
    number_of_workers -- the number of worker processes to use, or *None* if the number of worker
        processes should depend on the number of available cores. Defaults to *None*
    """
    def __init__(self,
                 model,
                 *,
                 overall_similarity_threshold=1.0,
                 embedding_based_matching_on_root_words=False,
                 ontology=None,
                 analyze_derivational_morphology=True,
                 perform_coreference_resolution=None,
                 debug=False,
                 verbose=True,
                 number_of_workers=None):
        self.semantic_analyzer = SemanticAnalyzerFactory().semantic_analyzer(
            model=model,
            perform_coreference_resolution=perform_coreference_resolution,
            debug=debug)
        if perform_coreference_resolution is None:
            perform_coreference_resolution = \
                self.semantic_analyzer.model_supports_coreference_resolution()
        validate_options(self.semantic_analyzer, overall_similarity_threshold,
                         embedding_based_matching_on_root_words,
                         perform_coreference_resolution)
        self.structural_matcher = StructuralMatcher(
            self.semantic_analyzer, ontology, overall_similarity_threshold,
            embedding_based_matching_on_root_words,
            analyze_derivational_morphology, perform_coreference_resolution)
        self._perform_coreference_resolution = perform_coreference_resolution

        self._verbose = verbose
        self._document_labels = []
        self._input_queues = []
        if number_of_workers is None:
            number_of_workers = cpu_count()
        self._number_of_workers = number_of_workers
        self._next_worker_to_use = 0
        self._multiprocessor_manager = Multiprocessing_manager()
        self._worker = Worker(
        )  # will be copied to worker processes by value (Windows) or
        # by reference (Linux)
        self._workers = []
        for counter in range(0, self._number_of_workers):
            input_queue = Queue()
            self._input_queues.append(input_queue)
            worker_label = ' '.join(('Worker', str(counter)))
            this_worker = Process(target=self._worker.listen,
                                  args=(self.semantic_analyzer,
                                        self.structural_matcher, input_queue,
                                        worker_label),
                                  daemon=True)
            self._workers.append(this_worker)
            this_worker.start()
        self._lock = Lock()

    def _add_document_label(self, label):
        with self._lock:
            if label in self._document_labels:
                raise DuplicateDocumentError(label)
            else:
                self._document_labels.append(label)

    def _handle_reply(self, worker_label, return_value):
        """ If 'return_value' is an exception, return it, otherwise return 'None'. """
        if isinstance(return_value, Exception):
            return return_value
        elif self._verbose:
            if not isinstance(return_value, list):
                with self._lock:
                    print(': '.join((worker_label, return_value)))
            return None

    def _internal_register_documents(self, dictionary, worker_method):
        reply_queue = self._multiprocessor_manager.Queue()
        for label, value in dictionary.items():
            self._add_document_label(label)
            with self._lock:
                self._input_queues[self._next_worker_to_use].put(
                    (worker_method, (value, label), reply_queue))
                self._next_worker_to_use += 1
                if self._next_worker_to_use == self._number_of_workers:
                    self._next_worker_to_use = 0
        recorded_exception = None
        for _ in range(0, len(dictionary)):
            possible_exception = self._handle_reply(*reply_queue.get())
            if possible_exception is not None and recorded_exception is None:
                recorded_exception = possible_exception
        if recorded_exception is not None:
            with self._lock:
                print('ERROR: not all documents were registered successfully. Please examine the '\
                ' above output from the worker processes to identify the problem.')

    def parse_and_register_documents(self, document_dictionary):
        """Parameters:

        document_dictionary -- a dictionary from unique document labels to raw document texts.
        """
        self._internal_register_documents(
            document_dictionary,
            self._worker.worker_parse_and_register_document)

    def deserialize_and_register_documents(self,
                                           serialized_document_dictionary):
        """Parameters:

        serialized_document_dictionary -- a dictionary from unique document labels to
        documents serialized using the *Manager.serialize_document()* method.
        """
        if self._perform_coreference_resolution:
            raise SerializationNotSupportedError(self.semantic_analyzer.model)
        self._internal_register_documents(
            serialized_document_dictionary,
            self._worker.worker_deserialize_and_register_document)

    def document_labels(self):
        with self._lock:
            document_labels = self._document_labels
        return sorted(document_labels)

    def topic_match_documents_returning_dictionaries_against(
            self,
            text_to_match,
            *,
            maximum_activation_distance=75,
            relation_score=30,
            reverse_only_relation_score=20,
            single_word_score=5,
            single_word_any_tag_score=2,
            overlapping_relation_multiplier=1.5,
            embedding_penalty=0.6,
            ontology_penalty=0.9,
            maximum_number_of_single_word_matches_for_relation_matching=500,
            maximum_number_of_single_word_matches_for_embedding_matching=100,
            sideways_match_extent=100,
            only_one_result_per_document=False,
            number_of_results=10,
            document_label_filter=None,
            tied_result_quotient=0.9):
        """Returns the results of a topic match between an entered text and the loaded documents.

        Properties:

        text_to_match -- the text to match against the loaded documents.
        maximum_activation_distance -- the number of words it takes for a previous phraselet
            activation to reduce to zero when the library is reading through a document.
        relation_score -- the activation score added when a normal two-word
            relation is matched.
        reverse_only_relation_score -- the activation score added when a two-word relation
                is matched using a search phrase that can only be reverse-matched.
        single_word_score -- the activation score added when a normal single
            word is matched.
        single_word_any_tag_score -- the activation score added when a single word is matched
            whose tag did not correspond to the template specification.
        overlapping_relation_multiplier -- the value by which the activation score is multiplied
            when two relations were matched and the matches involved a common document word.
        embedding_penalty -- a value between 0 and 1 with which scores are multiplied when the
            match involved an embedding. The result is additionally multiplied by the overall
            similarity measure of the match.
        ontology_penalty -- a value between 0 and 1 with which scores are multiplied for each
            word match within a match that involved the ontology. For each such word match,
            the score is multiplied by the value (abs(depth) + 1) times, so that the penalty is
            higher for hyponyms and hypernyms than for synonyms and increases with the
            depth distance.
        maximum_number_of_single_word_matches_for_relation_matching -- the maximum number
                of single word matches that are used as the basis for matching relations. If more
                document words than this value correspond to each of the two words within a
                relation phraselet, matching on the phraselet is not attempted.
        maximum_number_of_single_word_matches_for_embedding_matching -- the maximum number
                of single word matches that are used as the basis for reverse matching with
                embeddings at the parent word. If more than this value exist, reverse matching with
                embeddings is not attempted because the performance hit would be too great.
        sideways_match_extent -- the maximum number of words that may be incorporated into a
            topic match either side of the word where the activation peaked.
        only_one_result_per_document -- if 'True', prevents multiple results from being returned
            for the same document.
        number_of_results -- the number of topic match objects to return.
        document_label_filter -- optionally, a string with which document labels must start to
            be considered for inclusion in the results.
        tied_result_quotient -- the quotient between a result and following results above which
            the results are interpreted as tied.
        """
        if maximum_number_of_single_word_matches_for_embedding_matching > \
                maximum_number_of_single_word_matches_for_relation_matching:
            raise EmbeddingThresholdGreaterThanRelationThresholdError(' '.join((
                'embedding',
                str(maximum_number_of_single_word_matches_for_embedding_matching
                    ), 'relation',
                str(maximum_number_of_single_word_matches_for_relation_matching
                    ))))
        reply_queue = self._multiprocessor_manager.Queue()
        for counter in range(0, self._number_of_workers):
            self._input_queues[counter].put(
                (self._worker.
                 worker_topic_match_documents_returning_dictionaries_against,
                 (text_to_match, maximum_activation_distance, relation_score,
                  reverse_only_relation_score, single_word_score,
                  single_word_any_tag_score, overlapping_relation_multiplier,
                  embedding_penalty, ontology_penalty,
                  maximum_number_of_single_word_matches_for_relation_matching,
                  maximum_number_of_single_word_matches_for_embedding_matching,
                  sideways_match_extent, only_one_result_per_document,
                  number_of_results, document_label_filter,
                  tied_result_quotient), reply_queue))
        topic_match_dicts = []
        recorded_exception = None
        for _ in range(0, self._number_of_workers):
            worker_label, worker_topic_match_dicts = reply_queue.get()
            if recorded_exception is None:
                recorded_exception = self._handle_reply(
                    worker_label, worker_topic_match_dicts)
            if not isinstance(worker_topic_match_dicts, Exception):
                topic_match_dicts.extend(worker_topic_match_dicts)
        if recorded_exception is not None:
            with self._lock:
                print('ERROR: not all workers returned results. Please examine the above output '\
                ' from the worker processes to identify the problem.')
        return TopicMatchDictionaryOrderer().order(topic_match_dicts,
                                                   number_of_results,
                                                   tied_result_quotient)

    def start_topic_matching_search_mode_console(
            self,
            only_one_result_per_document=False,
            maximum_number_of_single_word_matches_for_relation_matching=500,
            maximum_number_of_single_word_matches_for_embedding_matching=100):
        """Starts a topic matching search mode console enabling the matching of pre-registered
            documents to search texts entered ad-hoc by the user.

            Parameters:

            only_one_result_per_document -- if 'True', prevents multiple topic match
                results from being returned for the same document.
            maximum_number_of_single_word_matches_for_relation_matching -- the maximum number
                of single word matches that are used as the basis for matching relations. If more
                document words than this value correspond to each of the two words within a
                relation phraselet, matching on the phraselet is not attempted.
            maximum_number_of_single_word_matches_for_embedding_matching -- the maximum number
              of single word matches that are used as the basis for matching with
              embeddings at the other word. If more than this value exist, matching with
              embeddings is not attempted because the performance hit would be too great.
        """
        holmes_consoles = HolmesConsoles(self)
        holmes_consoles.start_topic_matching_search_mode(
            only_one_result_per_document,
            maximum_number_of_single_word_matches_for_relation_matching=
            maximum_number_of_single_word_matches_for_relation_matching,
            maximum_number_of_single_word_matches_for_embedding_matching=
            maximum_number_of_single_word_matches_for_embedding_matching)

    def close(self):
        for worker in self._workers:
            worker.terminate()
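A hedged usage sketch for the class above. The constructor and method names are taken from the code shown here; the import path and the document texts are assumptions for illustration:

import holmes_extractor as holmes  # assumed import path

if __name__ == '__main__':
    manager = holmes.MultiprocessingManager('en_core_web_lg', number_of_workers=2)
    manager.parse_and_register_documents({
        'doc1': 'The dog chased the cat around the garden.',
        'doc2': 'A cat was pursued by a large dog.'})
    print(manager.document_labels())
    for result in manager.topic_match_documents_returning_dictionaries_against(
            'dog chases cat', number_of_results=5):
        print(result)
    manager.close()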
Example #4
def main(input_file, output_file, temp_dir=None, parallel=True):
    """ This module filters RFs according to input options and then computes some quality metrics on each RF.
        This enables different downstream approaches to selecting and filtering for good quality RFs.

        The stats attribute of each RF is populated with these quality metrics. In addition, a new root group
        is added to the hdf5 file containing a Pandas DataFrame that tabulates the attributes of each trace
        to allow easy event filtering in the downstream workflow.

    Available methods:
    1. rf_group_by_similarity - grouping method based on calculation of euclidean distances and clustering by
       similarity (a.k.a. the machine learning approach)
    2. TODO: coherence - finding the coherent signals (in frequency domain) relative to median. Consequently, moveout
       should be applied to use this technique
    3. TODO knive - analysing the change of RMS relative to median. Noisy stations will give higher input. Moveout
       should be applied to use this technique
    4. S/N ratio
    5. Spectral entropy
    """
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)

    similarity_eps = 0.05

    # Set up asynchronous buffered writing of results to file
    mgr = Manager()
    with h5py.File(input_file, mode='r') as h5f:
        config_str = h5f.attrs['metadata'] if 'metadata' in h5f.attrs else ''
    write_queue = mgr.Queue()
    output_thread = Process(target=async_write,
                            args=(write_queue, output_file, 20, config_str))
    output_thread.daemon = True
    output_thread.start()

    logger.info("Processing source file {}".format(input_file))
    if parallel:
        logger.info("Parallel processing")
        Parallel(n_jobs=-3, verbose=5, max_nbytes='16M', temp_folder=temp_dir) \
            (delayed(rf_quality_metrics_queue)(write_queue, station_id, station_stream3c, similarity_eps)
             for station_id, station_stream3c in IterRfH5StationEvents(input_file))
    else:
        logger.info("Serial processing")
        for station_id, station_stream3c in IterRfH5StationEvents(input_file):
            try:
                rf_quality_metrics_queue(write_queue, station_id,
                                         station_stream3c, similarity_eps)
            except (ValueError, AssertionError) as e:
                traceback.print_exc()
                logger.error(
                    "Unhandled exception occurred in rf_quality_metrics_queue for station {}. "
                    "Data will be omitted for this station!\nError:\n{}".
                    format(station_id, str(e)))
            # end try
        # end for
    # end if

    # Signal completion
    logger.info("Finishing...")
    write_queue.put(None)
    write_queue.join()

    logger.info("rf_quality_filter SUCCESS!")
Example #5
                          abnormal_fraction)


# Collections of all valid algorithms.
__ALGO_NAMES__ = [
    '{}-{}'.format(algo, p) for algo in ('cae', 'cae-iforest', 'drae', 'rdae',
                                         'dagmm', 'ssd-iforest', 'e3outlier')
    for p in (0.05, 0.1, 0.15, 0.2, 0.25)
]

if __name__ == '__main__':

    n_run = 5
    N_GPUS = 1  # deprecated, use one gpu only
    man = Manager()
    q = man.Queue(N_GPUS)
    for g in range(N_GPUS):
        q.put(str(g))

    experiments_list = [
        (load_mnist_with_outliers, 'mnist', 10),
        (load_fashion_mnist_with_outliers, 'fashion-mnist', 10),
        (load_cifar10_with_outliers, 'cifar10', 10),
        (load_cifar100_with_outliers, 'cifar100', 20),
        (load_svhn_with_outliers, 'svhn', 10),
    ]

    p_list = [0.05, 0.1, 0.15, 0.2, 0.25]
    for i in range(n_run):
        for data_load_fn, dataset_name, n_classes in experiments_list:
            for p in p_list:
Example #6
def evaluate_conv_net(storm_norm_data,
                      storm_meta,
                      hail_labels,
                      sampling_config,
                      param_combos,
                      out_path,
                      num_gpus=8):
    """

    Args:
        storm_norm_data:
        storm_meta:
        hail_labels:
        sampling_config:
        param_combos:
        out_path:
        num_gpus:
    Returns:

    """
    unique_dates = np.unique(storm_meta["run_dates"])
    np.random.seed(sampling_config["random_seed"])
    storm_sampler = train_split_generator(unique_dates,
                                          sampling_config["train_split"],
                                          sampling_config["num_samples"])
    best_param_combos = []
    sample_scores = pd.DataFrame(
        index=np.arange(sampling_config["num_samples"]),
        columns=[
            "Brier Score", "Brier Score Climo", "Brier Skill Score", "AUC"
        ],
        dtype=float)
    for n in range(sampling_config["num_samples"]):
        environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5,6,7"
        train_dates, test_dates = next(storm_sampler)
        print(train_dates, test_dates)
        train_indices = np.where(np.in1d(storm_meta["run_dates"],
                                         train_dates))[0]
        test_indices = np.where(np.in1d(storm_meta["run_dates"],
                                        test_dates))[0]
        all_members = np.unique(storm_meta.loc[train_indices, "members"])
        np.random.shuffle(all_members)
        member_split = int(
            np.round(all_members.size * sampling_config["member_split"]))
        train_members = all_members[:member_split]
        val_members = all_members[member_split:]
        print(train_members, val_members)
        train_member_indices = np.where(
            np.in1d(storm_meta.loc[train_indices, "members"],
                    train_members))[0]
        val_member_indices = np.where(
            np.in1d(storm_meta.loc[train_indices, "members"], val_members))[0]
        param_scores = pd.DataFrame(index=np.arange(param_combos.shape[0]),
                                    columns=["Brier Skill Score", "AUC"],
                                    dtype=float)
        score_outputs = []

        param_train_data = storm_norm_data[train_indices][train_member_indices]
        param_train_labels = hail_labels[train_indices][train_member_indices]
        param_val_data = storm_norm_data[train_indices][val_member_indices]
        param_val_labels = hail_labels[train_indices][val_member_indices]
        print("Saving training data")
        np.save(join(out_path, "param_train_data.npy"), param_train_data)
        np.save(join(out_path, "param_train_labels.npy"), param_train_labels)
        np.save(join(out_path, "param_val_data.npy"), param_val_data)
        np.save(join(out_path, "param_val_labels.npy"), param_val_labels)
        gpu_manager = Manager()
        gpu_queue = gpu_manager.Queue()
        n_pool = Pool(num_gpus, maxtasksperchild=1)
        for g in range(num_gpus):
            gpu_queue.put(g)

        for c in param_combos.index.values:
            print(c)
            score_outputs.append(
                n_pool.apply_async(
                    train_single_conv_net,
                    (c, gpu_queue, param_combos.loc[c].to_dict(), out_path)))
        n_pool.close()
        n_pool.join()
        #for c in param_combos.index.values:
        #    score_outputs.append(train_single_conv_net(c, gpu_queue, param_combos.loc[c].to_dict(), out_path))
        for async_out in score_outputs:
            out = async_out.get()
            param_scores.loc[out[1]] = out[0]
        del n_pool
        del gpu_queue
        del gpu_manager
        best_config = param_scores["Brier Skill Score"].idxmax()
        best_combo = param_combos.loc[best_config].to_dict()
        param_scores.to_csv(join(
            out_path, "conv_net_param_scores_sample_{0:03d}.csv".format(n)),
                            index_label="Param Combo")
        best_param_combos.append(best_config)
        print("Best Config")
        print(param_combos.loc[best_config])
        pool = Pool(1)
        np.save(join(out_path, "best_train_data.npy"),
                storm_norm_data[train_indices])
        np.save(join(out_path, "best_test_data.npy"),
                storm_norm_data[test_indices])
        sample_scores = pool.apply(
            train_best_conv_net,
            (best_combo, n, hail_labels[train_indices],
             storm_meta.loc[test_indices], hail_labels[test_indices],
             sample_scores, out_path))
        pool.close()
        pool.join()
        del pool
        sample_scores.to_csv(join(out_path, "conv_net_sample_scores.csv"),
                             index_label="Sample")
    best_config_frame = param_combos.loc[best_param_combos]
    best_config_frame = best_config_frame.reset_index()
    best_config_frame.to_csv(join(out_path, "conv_net_best_params.csv"),
                             index_label="Sample")
    return
Example #7
            all.append(price.text)
        else:
            all.append('R$0')

        if rating is not None:
            all.append(rating.text)
        else:
            all.append('-1')
        q.put(all)
        #print("---------------------------------------------------------------")


results = []
if __name__ == "__main__":
    m = Manager()
    q = m.Queue(
    )  # use this manager Queue instead of multiprocessing Queue as that causes error
    p = {}
    if sys.argv[1] in [
            't', 'p'
    ]:  # user decides which method to invoke: thread, process or pool
        for i in range(1, no_pages):
            if sys.argv[1] in ['t']:
                print("starting thread: ", i)
                p[i] = threading.Thread(target=get_data, args=(i, q))
                p[i].start()
            elif sys.argv[1] in ['p']:
                print("starting process: ", i)
                p[i] = Process(target=get_data, args=(i, q))
                p[i].start()
        # join should be done in a separate for loop
        # reason: if we joined inside the previous loop, p1.join() would block before the remaining workers were started
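As the last comments note, the workers are started in one loop and joined in a second one, so that joining p1 does not delay starting the remaining workers. A small sketch of that start/join split, collecting results through a Manager queue (get_data below is a stand-in, not the scraper used above):

from multiprocessing import Manager, Process

def get_data(page, q):
    # Stand-in worker: push one result per page onto the shared queue.
    q.put({'page': page, 'items': page * 10})

if __name__ == '__main__':
    m = Manager()
    q = m.Queue()
    procs = [Process(target=get_data, args=(i, q)) for i in range(1, 6)]
    for p in procs:          # first loop: start everything
        p.start()
    for p in procs:          # second loop: join, so starts are not serialized
        p.join()
    results = []
    while not q.empty():
        results.append(q.get())
    print(results)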
Example #8
def __init__(self, cache_fname='.sparql_cache'):
    self.cache_fname = cache_fname
    self.cache_has_changed = False
    manager = Manager()
    self.cache_queue = manager.Queue()
    self.cache = TryLoad(self.cache_fname)
Example #9
    assert sum(args.split) == 1 and not any(
        [i < 0 or i > 1 for i in args.split]), "Split must be valid distrib"

    traj_files = []
    for s in args.paths.split(':'):
        if 'traj_group' in s:
            traj_files = traj_files + glob.glob('{}/traj*'.format(s))
        else:
            for t_group in glob.glob('{}/traj_group*'.format(s)):
                traj_files = traj_files + glob.glob('{}/traj*'.format(t_group))
    random.shuffle(traj_files)

    print('Saving {} trajectories...'.format(len(traj_files)))

    m = Manager()
    record_queue = m.Queue()
    save_dir, T = args.save_dir, args.T
    seperate_good, traj_per_file = args.seperate_good, args.traj_per_file
    record_saver_proc = Process(target=record_worker,
                                args=(record_queue, save_dir, T, seperate_good,
                                      traj_per_file, args.offset,
                                      tuple(args.split)))
    record_saver_proc.start()

    if args.nworkers > 1:
        confs = []
        split = len(traj_files) // args.nworkers
        for w in range(args.nworkers):
            start, end = w * split, (w + 1) * split
            if w == args.nworkers - 1:
                end = len(traj_files)
Example #10
LOGS_PATH = f'{os.getcwd()}/'
logger = get_neat_logger(path=LOGS_PATH)

# N_SAMPLES = 10
N_PROCESSES = 16
N_GENOMES = 5

genomes = []
for i in range(N_GENOMES):
    genome = Genome(key=i)
    genome.create_random_genome()
    genomes.append(genome)


manager = Manager()
task_queue = manager.Queue()
exit_queue = manager.Queue()
exception_queue = manager.Queue()
results_queue = manager.Queue()


workers = []
for i in range(N_PROCESSES):
    worker = Worker(task_queue=task_queue,
                    exit_queue=exit_queue,
                    exception_queue=exception_queue,
                    results_queue=results_queue)
    worker.start()
    workers.append(worker)

for genome in genomes:
Example #11
    lock = Lock()

    total_lines = args.num_lines[0]
    num_processes = args.num_processes[0]
    group_size = num_processes
    total_groups = total_lines // group_size

    process_labels = []
    process_queues = []
    process = []
    pipes = []

    for i in range(0, num_processes):
        label = "process_  " + str(i) + "  : "
        process_labels.append(label)
        process_queues.append(the_man.Queue())
        send_end, recv_end = Pipe()
        pipes.append(recv_end)
        process.append(
            Process(target=pool_process_paragraph,
                    args=(
                        process_queues[i],
                        t1,
                        process_labels[i],
                        args.intermediate_text_address[0],
                        total_groups,
                        send_end,
                        lock,
                        num_processes,
                        i,
                    )))
Example #12
                if not result['OK']:
                    print("Failed queueing %s" % path)
    else:
        print("Task failed: %s" % result['Message'])
        if 'Path' in result:
            random.shuffle(lfcHosts)
            print("Requeueing task for directory %s, lfc %s" %
                  (result['Path'], lfcHosts[0]))


#########################################################################

pPool = ProcessPool(30, 40, 0)

manager = Manager()
writerQueue = manager.Queue()
stopFlag = Value('i', 0)

# pPool.daemonize()

# lfcHosts = ['lfc-lhcb-ro.cern.ch',
#             'lfc-lhcb-ro.cr.cnaf.infn.it',
#             'lhcb-lfc-fzk.gridka.de',
#             'lfc-lhcb-ro.in2p3.fr',
#             'lfc-lhcb.grid.sara.nl',
#             'lfclhcb.pic.es',
#             'lhcb-lfc.gridpp.rl.ac.uk']
lfcHosts = ['prod-lfc-lhcb-ro.cern.ch']

# path = "/lhcb/LHCb"
path = '/lhcb/user/c/chaen'
Example #13
        except EmptyException:
            pass        # Resume normal execution.

def sleep_process(secs, queue):
    """Sleep for a certain amount of time, and update state when finished.

    Arguments:
        secs {int} -- number of seconds to sleep for.
        queue {Queue} -- shared Queue used for interprocess communication
                         with run_apps.
    """

    sleep(secs)
    queue.put(True, timeout=0.001)      # Low timeout to prevent blocking.

if __name__ == '__main__':
    app_list = []
    for app_name in APPS:
        module = import_module(APPS.get(app_name))
        app = getattr(module, app_name)
        app_list.append(app)
    print(app_list)


    manager = Manager()
    queue = manager.Queue()

    pool = Pool(processes=2)
    pool.apply(run_apps, args=(app_list, ARGS, KWARGS, queue))
    pool.apply_async(sleep_process, args=(5, queue))
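sleep_process above signals the parent through the shared Manager queue once its delay has elapsed. A self-contained sketch of that idea, with a dummy long_task standing in for run_apps and illustrative timings:

from multiprocessing import Manager, Pool
from time import sleep

def long_task(queue):
    sleep(2)                       # pretend to do real work
    queue.put('task finished')

def sleep_process(secs, queue):
    sleep(secs)                    # acts as a deadline timer
    queue.put('deadline reached')

if __name__ == '__main__':
    manager = Manager()
    queue = manager.Queue()
    pool = Pool(processes=2)
    pool.apply_async(long_task, args=(queue,))
    pool.apply_async(sleep_process, args=(5, queue))
    print(queue.get())             # whichever signal arrives first
    pool.close()
    pool.join()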
Example #14
            try:
                v[seqshuffle(pop)[0]] += 1
            except KeyboardInterrupt:
                pass
        now = time.time()
        q.put((n, pop, trials, v, now - start))

    POP = int(sys.argv[1]) if len(sys.argv) > 1 else 50
    RUNS = range(8, 24 + 1)

    print 'Usage: python %s <popsize>' % (sys.argv[0], )
    print 'popsize:', POP
    print 'n: %u..%u' % (RUNS[0], RUNS[-1])

    man = Manager()
    q = man.Queue()
    p = Pool()

    # blame http://zachseward.com/sparktweets/
    # http://en.wikipedia.org/wiki/List_of_Unicode_characters#Block_elements
    BLOCKS = u' _▁▂▃▄▅▆▇█'

    popsize = POP

    prevtrials = 0
    prevv = [0 for _ in range(popsize + 1)]

    try:
        for n in RUNS:

            # scatter: calculate the amount of work (trials), split it into a bunch of jobs and run
Example #15
    def _executors_repro(
            self,
            executors: dict,
            jobs: Optional[int] = 1) -> Mapping[str, Mapping[str, str]]:
        """Run dvc repro for the specified BaseExecutors in parallel.

        Returns:
            dict mapping stash revs to the successfully executed experiments
            for each stash rev.
        """
        result: Dict[str, Dict[str, str]] = defaultdict(dict)

        manager = Manager()
        pid_q = manager.Queue()

        rel_cwd = relpath(os.getcwd(), self.repo.root_dir)
        with ProcessPoolExecutor(max_workers=jobs) as workers:
            futures = {}
            for rev, executor in executors.items():
                future = workers.submit(
                    executor.reproduce,
                    executor.dvc_dir,
                    rev,
                    queue=pid_q,
                    name=executor.name,
                    rel_cwd=rel_cwd,
                    log_level=logger.getEffectiveLevel(),
                )
                futures[future] = (rev, executor)

            try:
                wait(futures)
            except KeyboardInterrupt:
                # forward SIGINT to any running executor processes and
                # cancel any remaining futures
                pids = {}
                while not pid_q.empty():
                    rev, pid = pid_q.get()
                    pids[rev] = pid
                for future, (rev, _) in futures.items():
                    if future.running():
                        os.kill(pids[rev], signal.SIGINT)
                    elif not future.done():
                        future.cancel()

            for future, (rev, executor) in futures.items():
                rev, executor = futures[future]
                exc = future.exception()

                try:
                    if exc is None:
                        exec_result = future.result()
                        result[rev].update(
                            self._collect_executor(executor, exec_result))
                    else:
                        # Checkpoint errors have already been logged
                        if not isinstance(exc, CheckpointKilledError):
                            logger.exception(
                                "Failed to reproduce experiment '%s'",
                                rev[:7],
                                exc_info=exc,
                            )
                except CancelledError:
                    logger.error(
                        "Cancelled before attempting to reproduce experiment "
                        "'%s'",
                        rev[:7],
                    )
                finally:
                    executor.cleanup()

        return result
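The KeyboardInterrupt branch above depends on each executor having reported its pid through the Manager queue so the parent can forward SIGINT. A stripped-down sketch of that pid-queue pattern (reproduce below is a dummy stand-in, and os.kill with SIGINT assumes a POSIX system):

import os
import signal
from concurrent.futures import ProcessPoolExecutor, wait
from multiprocessing import Manager
from time import sleep

def reproduce(rev, queue):
    # Report our pid so the parent can forward SIGINT if it is interrupted.
    queue.put((rev, os.getpid()))
    sleep(1)                       # stand-in for the real work
    return rev

if __name__ == '__main__':
    manager = Manager()
    pid_q = manager.Queue()
    with ProcessPoolExecutor(max_workers=2) as workers:
        futures = {workers.submit(reproduce, rev, pid_q): rev
                   for rev in ('rev-a', 'rev-b')}
        try:
            wait(futures)
        except KeyboardInterrupt:
            # Forward SIGINT to running workers, cancel the rest.
            pids = {}
            while not pid_q.empty():
                rev, pid = pid_q.get()
                pids[rev] = pid
            for future, rev in futures.items():
                if future.running() and rev in pids:
                    os.kill(pids[rev], signal.SIGINT)
                elif not future.done():
                    future.cancel()
        for future, rev in futures.items():
            print(rev, future.result())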
Example #16
def run_tests_parallel(tests, prefix, options):
    # This queue will contain the results of the various tests run.
    # We could make this queue a global variable instead of using
    # a manager to share, but this will not work on Windows.
    queue_manager = Manager()
    async_test_result_queue = queue_manager.Queue()

    # This queue will be used by the result process to indicate
    # that it has received a result and we can start a new process
    # on our end. The advantage is that we don't have to sleep and
    # check for worker completion ourselves regularly.
    notify_queue = queue_manager.Queue()

    # This queue will contain the return value of the function
    # processing the test results.
    result_process_return_queue = queue_manager.Queue()
    result_process = Process(target=process_test_results_parallel,
                             args=(async_test_result_queue,
                                   result_process_return_queue, notify_queue,
                                   len(tests), options))
    result_process.start()

    # Ensure that a SIGTERM is handled the same way as SIGINT
    # to terminate all child processes.
    sigint_handler = signal.getsignal(signal.SIGINT)
    signal.signal(signal.SIGTERM, sigint_handler)

    worker_processes = []

    def remove_completed_workers(workers):
        new_workers = []
        for worker in workers:
            if worker.is_alive():
                new_workers.append(worker)
            else:
                worker.join()
        return new_workers

    try:
        testcnt = 0
        # Initially start as many jobs as allowed to run parallel
        for i in range(min(options.max_jobs, len(tests))):
            notify_queue.put(True)

        # For every item in the notify queue, start one new worker.
        # Every completed worker adds a new item to this queue.
        while notify_queue.get():
            if (testcnt < len(tests)):
                # Start one new worker
                worker_process = Process(target=wrap_parallel_run_test,
                                         args=(tests[testcnt], prefix,
                                               async_test_result_queue,
                                               options))
                worker_processes.append(worker_process)
                worker_process.start()
                testcnt += 1

                # Collect completed workers
                worker_processes = remove_completed_workers(worker_processes)
            else:
                break

        # Wait for all processes to terminate
        while len(worker_processes) > 0:
            worker_processes = remove_completed_workers(worker_processes)

        # Signal completion to result processor, then wait for it to complete on its own
        async_test_result_queue.put(None)
        result_process.join()

        # Return what the result process has returned to us
        return result_process_return_queue.get()
    except (Exception, KeyboardInterrupt) as e:
        # Print the exception if it's not an interrupt,
        # might point to a bug or other faulty condition
        if not isinstance(e, KeyboardInterrupt):
            traceback.print_exc()

        for worker in worker_processes:
            try:
                worker.terminate()
            except:
                pass

        result_process.terminate()

    return False
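The three queues described in the comments reduce to one core idea: the result processor feeds a notify queue, and the main loop starts one new worker per notify token, so at most max_jobs workers run at once. A simplified, self-contained sketch of that throttling scheme (run_test and process_results are dummies, not the test-harness functions above):

from multiprocessing import Manager, Process

def run_test(test, result_queue):
    result_queue.put(test * 2)          # stand-in for running one test

def process_results(result_queue, notify_queue, total):
    # Consume results; each consumed result frees one job slot.
    for _ in range(total):
        result_queue.get()
        notify_queue.put(True)
    notify_queue.put(False)             # tell the main loop to stop waiting

if __name__ == '__main__':
    tests = list(range(20))
    max_jobs = 4
    qm = Manager()
    result_queue = qm.Queue()
    notify_queue = qm.Queue()
    collector = Process(target=process_results,
                        args=(result_queue, notify_queue, len(tests)))
    collector.start()
    for _ in range(min(max_jobs, len(tests))):
        notify_queue.put(True)          # initial job slots
    started = 0
    workers = []
    while notify_queue.get():
        if started < len(tests):
            w = Process(target=run_test, args=(tests[started], result_queue))
            w.start()
            workers.append(w)
            started += 1
        else:
            break
    for w in workers:
        w.join()
    collector.join()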
Example #17
class MultiprocessRunner(SystemRunner):

    def __init__(self, system: System, pipeline_ids=(DEFAULT_PIPELINE_ID,), poll_interval=None,
                 setup_tables=False, sleep_for_setup_tables=0, *args, **kwargs):
        super(MultiprocessRunner, self).__init__(system=system, *args, **kwargs)
        self.pipeline_ids = pipeline_ids
        self.poll_interval = poll_interval or DEFAULT_POLL_INTERVAL
        assert isinstance(system, System)
        self.os_processes = None
        self.setup_tables = setup_tables or system.setup_tables
        self.sleep_for_setup_tables = sleep_for_setup_tables

    def start(self):
        assert self.os_processes is None, "Already started"

        self.os_processes = []

        self.manager = Manager()
        self.inboxes = {}
        self.outboxes = {}

        # Setup queues.
        for pipeline_id in self.pipeline_ids:
            for process_name, upstream_names in self.system.followings.items():
                inbox_id = (pipeline_id, process_name.lower())
                if inbox_id not in self.inboxes:
                    self.inboxes[inbox_id] = self.manager.Queue()
                for upstream_class_name in upstream_names:
                    outbox_id = (pipeline_id, upstream_class_name.lower())
                    if outbox_id not in self.outboxes:
                        self.outboxes[outbox_id] = PromptOutbox()
                    if inbox_id not in self.outboxes[outbox_id].downstream_inboxes:
                        self.outboxes[outbox_id].downstream_inboxes[inbox_id] = self.inboxes[inbox_id]

        # Subscribe to broadcast prompts published by a process
        # application in the parent operating system process.
        subscribe(handler=self.broadcast_prompt, predicate=self.is_prompt)

        # Start operating system process.
        for pipeline_id in self.pipeline_ids:
            for process_name, upstream_names in self.system.followings.items():
                process_class = self.system.process_classes[process_name]
                inbox = self.inboxes[(pipeline_id, process_name.lower())]
                outbox = self.outboxes.get((pipeline_id, process_name.lower()))
                os_process = OperatingSystemProcess(
                    application_process_class=process_class,
                    infrastructure_class=self.infrastructure_class,
                    upstream_names=upstream_names,
                    poll_interval=self.poll_interval,
                    pipeline_id=pipeline_id,
                    setup_tables=self.setup_tables,
                    inbox=inbox,
                    outbox=outbox,
                )
                os_process.daemon = True
                os_process.start()
                self.os_processes.append(os_process)
                if self.setup_tables:
                    # Avoid conflicts when creating tables.
                    sleep(self.sleep_for_setup_tables)

    def broadcast_prompt(self, prompt):
        outbox_id = (prompt.pipeline_id, prompt.process_name)
        outbox = self.outboxes.get(outbox_id)
        if outbox:
            outbox.put(prompt)

    @staticmethod
    def is_prompt(event):
        return isinstance(event, Prompt)

    def close(self):
        super(MultiprocessRunner, self).close()

        unsubscribe(handler=self.broadcast_prompt, predicate=self.is_prompt)

        for os_process in self.os_processes:
            os_process.inbox.put('QUIT')

        for os_process in self.os_processes:
            os_process.join(timeout=10)

        for os_process in self.os_processes:
            os_process.is_alive() and os_process.terminate()

        self.os_processes = None
        self.manager = None
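close() above shuts the child processes down by putting a 'QUIT' sentinel on each inbox. A minimal sketch of that inbox-and-sentinel arrangement, independent of the eventsourcing classes (the process names and the prompt value are made up):

from multiprocessing import Manager, Process

def run_process(name, inbox):
    # Poll the inbox until the 'QUIT' sentinel arrives.
    while True:
        item = inbox.get()
        if item == 'QUIT':
            break
        print(name, 'received prompt:', item)

if __name__ == '__main__':
    manager = Manager()
    inboxes = {name: manager.Queue() for name in ('orders', 'reservations')}
    procs = []
    for name, inbox in inboxes.items():
        p = Process(target=run_process, args=(name, inbox), daemon=True)
        p.start()
        procs.append(p)
    inboxes['orders'].put('new-order-123')       # illustrative prompt
    for inbox in inboxes.values():               # shut down as in close() above
        inbox.put('QUIT')
    for p in procs:
        p.join(timeout=10)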
Example #18
def custom_search(request):

    manager = Manager()
    q = manager.Queue()
    print q.qsize()
    start_time = timeit.default_timer()
    return_dict = manager.dict()
    if request.method == 'POST':
        data = json.loads(request.body)
        query = data['query']

        p1 = Process(target=search_google, args=(query, return_dict, q, "google process"))
        p1.daemon = True
        p1.name = "google process"
        q.put(p1.name)
        print p1

        p2 = Process(target=search_twitter, args=(query, return_dict, q, "twitter process"))
        p2.daemon = True
        p2.name = "twitter process"
        q.put(p2.name)
        print p2

        p3 = Process(target=search_duckgo, args=(query, return_dict, q, "duckduckgo process"))
        p3.daemon = True
        p3.name = "duckduckgo process"
        q.put(p3.name)
        print p3

        print q.qsize()

        p1.start()
        p2.start()
        p3.start()

        print "before p1 join"
        p1.join(1)
        print "after p1 join"
        p2.join(1)
        print "after p2 join"
        p3.join(1)
        print "after p3 join"

        # q.join()
        print "after q.join()"
        if p1.is_alive():
            print "timed out1"
            p1.terminate()
            p1.join()
            return_dict[p1.name] = "timed out"

        if p2.is_alive():
            print "timed out2"
            p2.terminate()
            p2.join()
            return_dict[p2.name] = "timed out"

        if p3.is_alive():
            print "timed out3"
            p3.terminate()
            p3.join()
            return_dict[p3.name] = "timed out"

        print "after time outs"
        end_time = timeit.default_timer()
        time_taken = end_time - start_time
        print "time taken: " + str(start_time) + " and " + str(end_time) + " : " + str(time_taken)

        print return_dict.values()

        if time_taken > 1.0:
            return HttpResponse(json.dumps(
                {"query": query,
                 "results": "error: time taken is greater than 1 sec"
                 }))

        else:
            google_data = return_dict["google process"]
            twitter_data = return_dict["twitter process"]
            duckgo_data = return_dict["duckduckgo process"]

            return HttpResponse(json.dumps(
                {"query": query,
                 "results": {
                     "google": google_data,
                     "twitter": twitter_data,
                     "duckduckgo": duckgo_data
                 }
                 }))
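The view above bounds each search with join(1) and falls back to terminate() plus a 'timed out' marker when a process is still alive. A compact sketch of that timeout pattern, using a Manager dict for results and dummy backends:

from multiprocessing import Manager, Process
from time import sleep

def search(query, return_dict, name, delay):
    sleep(delay)                    # pretend to call an external API
    return_dict[name] = '{} results for {}'.format(name, query)

if __name__ == '__main__':
    manager = Manager()
    return_dict = manager.dict()
    procs = {
        'fast backend': Process(target=search, args=('cats', return_dict, 'fast backend', 0.1)),
        'slow backend': Process(target=search, args=('cats', return_dict, 'slow backend', 5)),
    }
    for p in procs.values():
        p.start()
    for name, p in procs.items():
        p.join(1)                   # give each backend roughly one second
        if p.is_alive():
            p.terminate()
            p.join()
            return_dict[name] = 'timed out'
    print(dict(return_dict))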
Example #19
def evaluate_sklearn_model(model_name,
                           model_obj,
                           storm_data,
                           storm_meta,
                           hail_labels,
                           sampling_config,
                           param_combos,
                           out_path,
                           num_gpus=8):
    unique_dates = np.unique(storm_meta["run_dates"])
    np.random.seed(sampling_config["random_seed"])
    storm_sampler = train_split_generator(unique_dates,
                                          sampling_config["train_split"],
                                          sampling_config["num_samples"])
    best_param_combos = []
    sample_scores = pd.DataFrame(
        index=np.arange(sampling_config["num_samples"]),
        columns=[
            "Brier Score", "Brier Score Climo", "Brier Skill Score", "AUC"
        ],
        dtype=float)
    for n in range(sampling_config["num_samples"]):
        train_dates, test_dates = next(storm_sampler)
        train_indices = np.where(np.in1d(storm_meta["run_dates"],
                                         train_dates))[0]
        test_indices = np.where(np.in1d(storm_meta["run_dates"],
                                        test_dates))[0]
        all_members = np.unique(storm_meta.loc[train_indices, "members"])
        np.random.shuffle(all_members)
        member_split = int(
            np.round(all_members.size * sampling_config["member_split"]))
        train_members = all_members[:member_split]
        val_members = all_members[member_split:]
        train_member_indices = np.where(
            np.in1d(storm_meta.loc[train_indices, "members"],
                    train_members))[0]
        val_member_indices = np.where(
            np.in1d(storm_meta.loc[train_indices, "members"], val_members))[0]
        param_scores = pd.DataFrame(index=np.arange(param_combos.shape[0]),
                                    columns=["Brier Skill Score", "AUC"],
                                    dtype=float)

        score_outputs = []
        param_train_data = storm_data[train_indices][train_member_indices]
        param_train_labels = hail_labels[train_indices][train_member_indices]
        param_val_data = storm_data[train_indices][val_member_indices]
        param_val_labels = hail_labels[train_indices][val_member_indices]
        print("Saving training data")
        np.save(join(out_path, "param_train_data.npy"), param_train_data)
        np.save(join(out_path, "param_train_labels.npy"), param_train_labels)
        np.save(join(out_path, "param_val_data.npy"), param_val_data)
        np.save(join(out_path, "param_val_labels.npy"), param_val_labels)
        gpu_manager = Manager()
        gpu_queue = gpu_manager.Queue()
        n_pool = Pool(num_gpus, maxtasksperchild=1)
        for g in range(num_gpus):
            gpu_queue.put(g)

        for c in param_combos.index.values:
            print(c)
            score_outputs.append(
                n_pool.apply_async(train_single_sklearn_model,
                                   (model_name, model_obj, c,
                                    param_combos.loc[c].to_dict(), out_path),
                                   dict(device_queue=gpu_queue)))
        n_pool.close()
        n_pool.join()
        for async_out in score_outputs:
            out = async_out.get()
            param_scores.loc[out[1]] = out[0]
        del n_pool
        del gpu_queue
        del gpu_manager
        #for c in param_combos.index:
        #    print(param_combos.loc[c])
        #    model_inst = model_obj(**param_combos.loc[c].to_dict())
        #    model_inst.fit(storm_data[train_indices][train_member_indices],
        #                   hail_labels[train_indices][train_member_indices])
        #    val_preds = model_inst.predict_proba(storm_data[train_indices][val_member_indices])[:, 1]
        #    param_scores.loc[c, "Brier Skill Score"] = brier_skill_score(hail_labels[train_indices][val_member_indices],
        #                                                                 val_preds)
        #    param_scores.loc[c, "AUC"] = roc_auc_score(hail_labels[train_indices][val_member_indices],
        #                                               val_preds)
        #    if param_scores.loc[c, "Brier Skill Score"] > best_score:
        #        best_config = c
        #        best_score = param_scores.loc[c, "Brier Skill Score"]
        #    del model_inst

        param_scores.to_csv(join(
            out_path,
            "{0}_param_scores_sample_{1:03d}.csv".format(model_name, n)),
                            index_label="Param Combo")
        best_config = param_scores["Brier Skill Score"].idxmax()
        best_combo = param_combos.loc[best_config].to_dict()
        best_param_combos.append(best_config)
        print("Best Config")
        print(param_combos.loc[best_config])
        pool = Pool(1)
        np.save(join(out_path, "best_train_data.npy"),
                storm_data[train_indices])
        np.save(join(out_path, "best_test_data.npy"), storm_data[test_indices])
        sample_scores = pool.apply(
            train_best_sklearn_model,
            (model_name, model_obj, best_combo, n, hail_labels[train_indices],
             storm_meta.loc[test_indices], hail_labels[test_indices],
             sample_scores, out_path))
        pool.close()
        pool.join()
        del pool
        sample_scores.to_csv(join(out_path,
                                  "{0}_sample_scores.csv".format(model_name)),
                             index_label="Sample")
        #print("Train Best " + model_name)
        #model_inst = model_obj(**param_combos.loc[best_config].to_dict())
        #model_inst.fit(storm_data[train_indices],
        #               hail_labels[train_indices])
        #print("Scoring " + model_name)
        #test_pred_frame = storm_meta.loc[test_indices]
        #test_pred_frame[model_name] = model_inst.predict_proba(storm_data[test_indices])[:, 1]
        #test_pred_frame["label"] = hail_labels[test_indices]
        #test_preds = test_pred_frame[model_name].values
        #test_pred_frame = pd.DataFrame({"indices": test_indices,
        # "lon": storm_centers[test_indices, 0],
        # "lat": storm_centers[test_indices, 1],
        # "run_dates": storm_run_dates[test_indices],
        # "valid_dates": storm_valid_dates[test_indices],
        # "members": storm_members[test_indices],
        # model_name: test_preds,
        # "label": hail_labels[test_indices]},
        #columns=["indices", "lon", "lat", "dates", "members", "conv_net", "label"])
        #test_pred_frame.to_csv(join(out_path, "predictions_{0}_sample_{1:03d}.csv".format(model_name, n)), index_label="Index")
        #sample_scores.loc[n, "Brier Score"] = brier_score(hail_labels[test_indices], test_preds)
        #sample_scores.loc[n, "Brier Score Climo"] = brier_score(hail_labels[test_indices],
        #                                                        hail_labels[test_indices].mean())
        #sample_scores.loc[n, "Brier Skill Score"] = brier_skill_score(hail_labels[test_indices], test_preds)
        #sample_scores.loc[n, "AUC"] = roc_auc_score(hail_labels[test_indices], test_preds)
        #
        #del model_inst
    #sample_scores.to_csv(join(out_path, "{0}_sample_scores.csv".format(model_name)), index_label="Sample")
    best_config_frame = param_combos.loc[best_param_combos]
    best_config_frame = best_config_frame.reset_index()
    best_config_frame.to_csv(join(out_path,
                                  "{0}_best_params.csv".format(model_name)),
                             index_label="Sample")
    return
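
The block above delegates the final fit to a one-worker Pool so the training runs in a child process and its memory is reclaimed when the pool is joined. A minimal sketch of that isolation pattern, with a hypothetical train_once standing in for train_best_sklearn_model:

from multiprocessing import Pool


def train_once(n_samples):
    # stand-in for the heavy fit/score step; runs entirely in the child process
    data = list(range(n_samples))
    return sum(data) / float(n_samples)


if __name__ == "__main__":
    pool = Pool(1)                            # one worker: training is isolated in a child
    score = pool.apply(train_once, (1000,))   # blocks until the child returns its result
    pool.close()
    pool.join()                               # child exits here, releasing its memory
    print(score)
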
Example #20
0
def calcProb(routes):
    i = 1
    for routeid, ti, tj, Dij, Dji in routes:
        print "Route %d/%d - ti: %d, tj: %d, Dij: %d, Dji: %d" % (
            i, len(routes), ti, tj, Dij, Dji)
        i += 1
        Dij = min(Dij, Dji)
        if Dij <= 0: Dij = 1
        # Fetch the edges
        con = Verbinding()
        try:
            sql = 'SELECT edge_id, db, df FROM prismedges WHERE route_id=%d' % routeid
            edges = con.selectAll(sql)
            # Route constants
            vmean = 1 / ((tj - ti) * 1.0)
            vmax = 1 / (Dij * 1.0)
            # Compute XY
            edgevars = {}
            print 'Calculating XY...'
            manager = Manager()
            tq = manager.Queue()
            rq = manager.Queue()
            for edgeid, Db, Df in edges:
                if Db + Df == 0:
                    Df = 1
                tq.put((edgeid, Db, Df))
                edgevars[edgeid] = EdgeVar(edgeid, Db=Db, Df=Df)
            worker1 = ParabolaWorker(tq, rq, Dij, ti, tj)
            worker2 = ParabolaWorker(tq, rq, Dij, ti, tj)
            worker3 = ParabolaWorker(tq, rq, Dij, ti, tj)
            worker1.start()
            worker2.start()
            worker3.start()
            tq.put(None)
            tq.put(None)
            tq.put(None)
            worker1.join()
            worker2.join()
            worker3.join()
            while not rq.empty():
                edge = rq.get()
                k = edge[0]
                edgevars[k].tb, edgevars[k].tf, edgevars[k].x, edgevars[
                    k].y = edge[1:]
            worker1.terminate()
            worker2.terminate()
            worker3.terminate()
            #Iterate over time
            print 'Iterating over time...'
            edges = [(k, v.tb, v.tf) for k, v in edgevars.iteritems()]
            prism = Prism(edges)
            tq = manager.Queue()
            rq = manager.Queue()
            sq = manager.Queue(1)
            for t, edgelist in prism.iteratePrism(stepsize=30):
                tq.put((t, edgelist))
            worker1 = IteratorWorker(tq, rq, ti, tj, vmax, vmean, edgevars)
            worker2 = IteratorWorker(tq, rq, ti, tj, vmax, vmean, edgevars)
            worker3 = IteratorWorker(tq, rq, ti, tj, vmax, vmean, edgevars)
            summator = SumWorker(rq, sq, edgevars)
            worker1.start()
            worker2.start()
            worker3.start()
            summator.start()
            tq.put(None)
            tq.put(None)
            tq.put(None)
            worker1.join()
            worker2.join()
            worker3.join()
            rq.put(None)
            summator.join(10)
            edgevars = sq.get()
            summator.terminate()
            worker1.terminate()
            worker2.terminate()
            worker3.terminate()
            #Copy edge to table
            routestr = ''
            for edge in edgevars.itervalues():
                if edge.P > 1.0: edge.P = 1
                routestr += '%d\t%d\t%d\t%d\t%f\t%f\t%f\t%f\n' % (
                    routeid, edge.edgeid, edge.Db, edge.Df, edge.x, edge.y,
                    1.0 - exp(edge.P), edge.E)
            f = StringIO(routestr)
            con.copyfrom(f,
                         'probedges',
                         columns=('route_id', 'edge_id', 'db', 'df', 'x', 'y',
                                  'P', 'E'))
            con.commit()
        finally:
            con.sluit()
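
The example above follows a common sentinel pattern: every task goes onto a Manager queue, one None per worker is appended as a stop signal, and the results are drained after the joins. A standalone sketch of that pattern (the worker and queue names here are illustrative):

from multiprocessing import Manager, Process


def square_worker(task_q, result_q):
    while True:
        item = task_q.get()
        if item is None:              # sentinel: stop this worker
            break
        result_q.put((item, item * item))


if __name__ == '__main__':
    manager = Manager()
    task_q, result_q = manager.Queue(), manager.Queue()
    for i in range(10):
        task_q.put(i)
    workers = [Process(target=square_worker, args=(task_q, result_q)) for _ in range(3)]
    for w in workers:
        w.start()
    for _ in workers:                 # one sentinel per worker
        task_q.put(None)
    for w in workers:
        w.join()
    while not result_q.empty():       # safe here: all producers have exited
        print(result_q.get())
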
Example #21
0
                combined_advert = Advertiser(
                    name=duplicate_adverts.all()[0].name,
                    count=total_count,
                    broker_id=cur_broker.id)
                s.add(combined_advert)
                for advert in duplicate_adverts.all():
                    s.delete(advert)
                s.commit()


if __name__ == "__main__":
    session = Session()
    parse_file = 'events.ra.csv'
    num_workers = 4
    manager = Manager()
    work = manager.Queue(num_workers)

    # start the workers
    pool = []
    for i in range(num_workers):
        print("spawning process")
        p = Process(target=do_work, args=(work, ))
        p.start()
        pool.append(p)

    # produce data
    with open(parse_file, 'rt') as csvfile:
        check_loop = 10000
        timing = []
        reader = csv.reader(csvfile)
        parse = False
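
The snippet above is cut off inside the producer loop. The usual completion of this pattern feeds each CSV row into the bounded work queue, then sends one sentinel per worker and joins them; a self-contained sketch (the echoing do_work below is illustrative, not the original implementation):

import csv
from multiprocessing import Manager, Process


def do_work(work_q):
    # illustrative worker: pull rows until the None sentinel arrives
    while True:
        row = work_q.get()
        if row is None:
            break
        print("processing", row)


if __name__ == "__main__":
    parse_file = "events.ra.csv"
    with open(parse_file, "w", newline="") as fh:      # tiny stand-in input file
        csv.writer(fh).writerows([["a", 1], ["b", 2], ["c", 3]])

    num_workers = 4
    manager = Manager()
    work = manager.Queue(num_workers)                  # bounded: applies back-pressure
    pool = [Process(target=do_work, args=(work,)) for _ in range(num_workers)]
    for p in pool:
        p.start()

    with open(parse_file, "rt") as csvfile:
        for row in csv.reader(csvfile):
            work.put(row)                              # blocks while the queue is full

    for _ in pool:
        work.put(None)                                 # one sentinel per worker
    for p in pool:
        p.join()
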
Example #22
0
def iasi_level2_runner():
    """Listens and triggers processing"""

    LOG.info(
        "*** Start the extraction and conversion of ears iasi level2 profiles")

    pool = Pool(processes=6, maxtasksperchild=1)
    manager = Manager()
    listener_q = manager.Queue()
    publisher_q = manager.Queue()

    pub_thread = FilePublisher(publisher_q)
    pub_thread.start()
    listen_thread = FileListener(listener_q)
    listen_thread.start()

    jobs_dict = {}
    while True:

        try:
            msg = listener_q.get()
        except Empty:
            LOG.debug("Empty listener queue...")
            continue

        LOG.debug("Number of threads currently alive: " +
                  str(threading.active_count()))

        if 'start_time' in msg.data:
            start_time = msg.data['start_time']
        elif 'nominal_time' in msg.data:
            start_time = msg.data['nominal_time']
        else:
            LOG.warning("Neither start_time nor nominal_time in message!")
            start_time = None

        if 'end_time' in msg.data:
            end_time = msg.data['end_time']
        else:
            LOG.warning("No end_time in message!")
            if start_time:
                end_time = start_time + timedelta(seconds=60 * 15)
            else:
                end_time = None

        if not start_time or not end_time:
            LOG.warning("Missing either start_time or end_time or both!")
            LOG.warning("Ignore message and continue...")
            continue

        sensor = str(msg.data['sensor'])
        platform_name = msg.data['platform_name']

        keyname = (str(platform_name) + '_' +
                   str(start_time.strftime('%Y%m%d%H%M')))

        jobs_dict[keyname] = datetime.utcnow()

        urlobj = urlparse(msg.data['uri'])
        path, fname = os.path.split(urlobj.path)
        LOG.debug("path " + str(path) + " filename = " + str(fname))

        scene = {
            'platform_name': platform_name,
            'starttime': start_time,
            'endtime': end_time,
            'sensor': sensor,
            'filename': urlobj.path
        }

        # if keyname not in jobs_dict:
        #    LOG.warning("Scene-run seems unregistered! Forget it...")
        #    continue
        pool.apply_async(format_conversion,
                         (msg.data, scene, jobs_dict[keyname], publisher_q))

        # Block any future run on this scene for 5 minutes from now
        thread_job_registry = threading.Timer(5 * 60.0,
                                              reset_job_registry,
                                              args=(jobs_dict, keyname))
        thread_job_registry.start()

    pool.close()
    pool.join()

    pub_thread.stop()
    listen_thread.stop()
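
The runner above blocks re-processing of a scene by registering it in jobs_dict and arming a threading.Timer that clears the entry after five minutes. reset_job_registry is not shown in this snippet, so the helper below is an assumption about its behaviour:

import threading
from datetime import datetime


def reset_job_registry(jobs_dict, key):
    # assumed behaviour of the helper used above: forget the scene once the timer fires
    jobs_dict.pop(key, None)


jobs_dict = {}
keyname = "Metop-B_202001011200"          # illustrative platform/start-time key
jobs_dict[keyname] = datetime.utcnow()

# block any further run on this scene for 5 minutes, then allow it again
timer = threading.Timer(5 * 60.0, reset_job_registry, args=(jobs_dict, keyname))
timer.start()
timer.cancel()                            # cancelled here only so the sketch exits promptly
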
Example #23
0
class LocalExecutor(BaseExecutor):
    """
    LocalExecutor executes tasks locally in parallel.
    It uses the multiprocessing Python library and queues to parallelize the execution
    of tasks.

    :param parallelism: how many parallel processes are run in the executor
    """
    def __init__(self, parallelism: int = PARALLELISM):
        super().__init__(parallelism=parallelism)
        self.manager: Optional[SyncManager] = None
        self.result_queue: Optional['Queue[TaskInstanceStateType]'] = None
        self.workers: List[QueuedLocalWorker] = []
        self.workers_used: int = 0
        self.workers_active: int = 0
        self.impl: Optional[Union['LocalExecutor.UnlimitedParallelism',
                                  'LocalExecutor.LimitedParallelism']] = None

    class UnlimitedParallelism:
        """
        Implements LocalExecutor with unlimited parallelism, starting one process
        per each command to execute.

        :param executor: the executor instance to implement.
        """
        def __init__(self, executor: 'LocalExecutor'):
            self.executor: 'LocalExecutor' = executor

        def start(self) -> None:
            """Starts the executor."""
            self.executor.workers_used = 0
            self.executor.workers_active = 0

        # pylint: disable=unused-argument # pragma: no cover
        def execute_async(
            self,
            key: TaskInstanceKey,
            command: CommandType,
            queue: Optional[str] = None,
            executor_config: Optional[Any] = None,
        ) -> None:
            """
            Executes task asynchronously.

            :param key: the key to identify the task instance
            :param command: the command to execute
            :param queue: Name of the queue
            :param executor_config: configuration for the executor
            """
            if not self.executor.result_queue:
                raise AirflowException(NOT_STARTED_MESSAGE)
            local_worker = LocalWorker(self.executor.result_queue,
                                       key=key,
                                       command=command)
            self.executor.workers_used += 1
            self.executor.workers_active += 1
            local_worker.start()

        # pylint: enable=unused-argument # pragma: no cover
        def sync(self) -> None:
            """Sync will get called periodically by the heartbeat method."""
            if not self.executor.result_queue:
                raise AirflowException("Executor should be started first")
            while not self.executor.result_queue.empty():
                results = self.executor.result_queue.get()
                self.executor.change_state(*results)
                self.executor.workers_active -= 1

        def end(self) -> None:
            """
            This method is called when the caller is done submitting job and
            wants to wait synchronously for the job submitted previously to be
            all done.
            """
            while self.executor.workers_active > 0:
                self.executor.sync()

    class LimitedParallelism:
        """
        Implements LocalExecutor with limited parallelism using a task queue to
        coordinate work distribution.

        :param executor: the executor instance to implement.
        """
        def __init__(self, executor: 'LocalExecutor'):
            self.executor: 'LocalExecutor' = executor
            self.queue: Optional['Queue[ExecutorWorkType]'] = None

        def start(self) -> None:
            """Starts limited parallelism implementation."""
            if not self.executor.manager:
                raise AirflowException(NOT_STARTED_MESSAGE)
            self.queue = self.executor.manager.Queue()
            if not self.executor.result_queue:
                raise AirflowException(NOT_STARTED_MESSAGE)
            self.executor.workers = [
                QueuedLocalWorker(self.queue, self.executor.result_queue)
                for _ in range(self.executor.parallelism)
            ]

            self.executor.workers_used = len(self.executor.workers)

            for worker in self.executor.workers:
                worker.start()

        def execute_async(
                self,
                key: TaskInstanceKey,
                command: CommandType,
                queue: Optional[str] = None,  # pylint: disable=unused-argument
                executor_config: Optional[Any] = None,  # pylint: disable=unused-argument
        ) -> None:
            """
            Executes task asynchronously.

            :param key: the key to identify the task instance
            :param command: the command to execute
            :param queue: name of the queue
            :param executor_config: configuration for the executor
            """
            if not self.queue:
                raise AirflowException(NOT_STARTED_MESSAGE)
            self.queue.put((key, command))

        def sync(self):
            """Sync will get called periodically by the heartbeat method."""
            while True:
                try:
                    results = self.executor.result_queue.get_nowait()
                    try:
                        self.executor.change_state(*results)
                    finally:
                        self.executor.result_queue.task_done()
                except Empty:
                    break

        def end(self):
            """Ends the executor. Sends the poison pill to all workers."""
            for _ in self.executor.workers:
                self.queue.put((None, None))

            # Wait for commands to finish
            self.queue.join()
            self.executor.sync()

    def start(self) -> None:
        """Starts the executor"""
        self.manager = Manager()
        self.result_queue = self.manager.Queue()
        self.workers = []
        self.workers_used = 0
        self.workers_active = 0
        self.impl = (LocalExecutor.UnlimitedParallelism(self)
                     if self.parallelism == 0 else
                     LocalExecutor.LimitedParallelism(self))

        self.impl.start()

    def execute_async(
        self,
        key: TaskInstanceKey,
        command: CommandType,
        queue: Optional[str] = None,
        executor_config: Optional[Any] = None,
    ) -> None:
        """Execute asynchronously."""
        if not self.impl:
            raise AirflowException(NOT_STARTED_MESSAGE)

        self.validate_command(command)

        self.impl.execute_async(key=key,
                                command=command,
                                queue=queue,
                                executor_config=executor_config)

    def sync(self) -> None:
        """Sync will get called periodically by the heartbeat method."""
        if not self.impl:
            raise AirflowException(NOT_STARTED_MESSAGE)
        self.impl.sync()

    def end(self) -> None:
        """
        Ends the executor.
        :return:
        """
        if not self.impl:
            raise AirflowException(NOT_STARTED_MESSAGE)
        if not self.manager:
            raise AirflowException(NOT_STARTED_MESSAGE)
        self.impl.end()
        self.manager.shutdown()

    def terminate(self):
        """Terminate the executor is not doing anything."""
Example #24
0
File: mp.py Project: notyourav/tp
def execute_tasks(process_count: int,
                  input_tasks: List[Tuple[Any, Any]],
                  shared: Dict[str, Any] = {},
                  callback: Any = None) -> List[Any]:
    """
    Creates 'process_count' processes that will together execute the provided tasks.
    """

    manager = Manager()
    results = [None] * len(input_tasks)

    if len(input_tasks) == 1:
        process_count = 0

    if process_count == 0:
        output = manager.Queue()
        for i, task in enumerate(input_tasks):
            context = MainContext(i, output)
            results[i] = task[0](context, *task[1], **shared)
            callback("complete", i)
            while not output.empty():
                command = output.get(block=True)
                callback(command[0], *command[1])

        return results

    input = manager.Queue()
    output = manager.Queue()
    timeout = 5 * 60  # if one single task takes more than 5 minutes, something is wrong

    # instead of copying state for each task, shared state is written to a file which is loaded once per process.
    shared_file = None
    temp_file = None
    if len(shared) > 0:
        context = MainContext(0, None)
        with TimeCode(context, "create_shared") as tc:
            temp_file = tempfile.NamedTemporaryFile("wb",
                                                    suffix='.dump',
                                                    prefix="mp_shared",
                                                    delete=True)
            shared_file = temp_file.name
            debug(f"shared file: '{temp_file.name}'")
            pickle_data = pickle.dumps(shared)
            temp_file.write(pickle_data)
            temp_file.flush()

    # add tasks to the task queue
    for i, task in enumerate(input_tasks):
        try:
            input.put((i, task))
        except:
            get_console().print_exception()
            error(i)
            error(task)
            fatal_exit()

    # create the processes
    processors = [
        Process(target=_process_entrypoint, args=(input, output, shared_file))
        for i in range(process_count)
    ]

    # start the processes
    for process in processors:
        process.start()

    # receive messages
    waiting = len(input_tasks)
    while waiting > 0:
        try:
            command = output.get(block=True, timeout=timeout)
            processing = True
            if callback:
                processing = callback(command[0], *command[1])
            if processing:
                if command[0] == 'debug':
                    debug(*command[1])
                elif command[0] == 'warning':
                    warning(*command[1])
                elif command[0] == 'error':
                    error(*command[1])
                elif command[0] == 'info':
                    info(*command[1])
                elif command[0] == 'complete':
                    results[command[1][0]] = command[1][1]
                    waiting -= 1
                elif command[0] == 'exception':
                    waiting -= 1
                    print(command[1][1])
                elif command[0] == 'exit':
                    sys.exit(1)
                else:
                    warning(f"unknown command: {command}")
        except Empty:
            error(f"task took to long to complete (+{timeout} seconds)")
            fatal_exit()

    # wait for all processes to finish
    for process in processors:
        process.join()

    # TODO: Maybe we don't need to clear the queue
    while not output.empty():
        command = output.get(block=False)
        warning(f"skipped command: {command}")

    if temp_file:
        temp_file.close()

    return results
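
The interesting trick above is the shared-state file: instead of pickling the shared dict into every task, it is dumped once to a temporary file and each worker process loads it a single time at startup. A minimal sketch of that idea (the helper and key names are illustrative):

import os
import pickle
import tempfile
from multiprocessing import Manager, Process


def worker_entry(task_q, result_q, shared_file):
    shared = {}
    if shared_file:
        with open(shared_file, "rb") as fh:
            shared = pickle.load(fh)       # loaded once per process, not once per task
    while True:
        item = task_q.get()
        if item is None:
            break
        i, value = item
        result_q.put((i, value * shared.get("factor", 1)))


if __name__ == "__main__":
    shared = {"factor": 10}
    tmp = tempfile.NamedTemporaryFile("wb", suffix=".dump", delete=False)
    tmp.write(pickle.dumps(shared))
    tmp.close()

    manager = Manager()
    task_q, result_q = manager.Queue(), manager.Queue()
    for i in range(5):
        task_q.put((i, i))
    procs = [Process(target=worker_entry, args=(task_q, result_q, tmp.name))
             for _ in range(2)]
    for p in procs:
        p.start()
    for _ in procs:
        task_q.put(None)
    for p in procs:
        p.join()
    while not result_q.empty():
        print(result_q.get())
    os.unlink(tmp.name)
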
Example #25
0
		od=''
	sfile='{}Summary_{}{}.csv'.format(od,ref[0:4],str(len(ref)))
	inffile='{}/PDB_info.txt'.format(odir)

	data=[]
	dheader=['PDB ID','Segment','Match length','Alignment','Cropped']
	data.append(dheader)
	data.append(['Ref','Ref',len(ref),ref,ref])

	info=[]
	iheader=['File','Protein segment','Ligand segment','Ligand name','Complex']
	info.append(iheader)

	p=Pool(cores)
	m = Manager()
	q = m.Queue()
	args=[]
	inps=ilist


	blist=slice_list(ilist,cores)

	for ifiles in blist:
		args.append((ifiles,ref,odir,l,ph,q))

	result = p.map_async(crop_handler, args)
	start=time.time()
	prcprev=0

	while True:
		if result.ready():
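
The loop above (cut off here) polls result.ready() while the workers report per-file progress through the Manager queue q. A self-contained sketch of that polling pattern (the crop_chunk worker is illustrative):

import time
from multiprocessing import Manager, Pool


def crop_chunk(args):
    chunk, progress_q = args
    for item in chunk:
        progress_q.put(item)              # report per-item progress to the parent
    return len(chunk)


if __name__ == "__main__":
    manager = Manager()
    progress_q = manager.Queue()
    chunks = [list(range(i, i + 5)) for i in range(0, 20, 5)]
    pool = Pool(4)
    result = pool.map_async(crop_chunk, [(c, progress_q) for c in chunks])

    done = 0
    while not result.ready():             # poll completion while draining progress
        while not progress_q.empty():
            progress_q.get()
            done += 1
        time.sleep(0.1)
    pool.close()
    pool.join()
    while not progress_q.empty():         # pick up anything reported near the end
        progress_q.get()
        done += 1
    print("items reported:", done, "chunk sizes:", result.get())
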
Example #26
0
#     if m.im_self is None:
#         return getattr, (m.im_class, m.im_func.func_name)
#     else:
#         return getattr, (m.im_self, m.im_func.func_name)

static_reg = re.compile(r'\.html$|\.htm$|\.shtml$|\.css$|\.png$|\.js$|\.dpg$|\.jpg$|\.svg$|\.jpeg$|'
                            r'\.gif$|\.webp$|\.ico$|\.woff$|\.ttf$|css\?|js\?|jpg\?|png\?|woff\?v='
                            r'|woff2\?v=|ttf\?|woff\?|woff2$|html\?v=|ico$')
burp_traffic = []
manager = Manager()
case_list = manager.list()
openner_result = manager.list()
# for deduplication
# api_list=manager.list()
# filtered=manager.list()
traffic_queue = manager.Queue()
# for saving to a local file
traffic_list = manager.list()
# save reflections for analysis
reflect_list = manager.list()
# filter
api_list = manager.list()

class Traffic_generator(Process):
    DEFAULT_HEADER = {
        'User-Agent': 'Mozilla/2.0 (X11; Linux x86_64) AppleWebKit/237.36 (KHTML, like Gecko) Chrome/62.0.3322.146 Safari/237.36',
    }

    def __init__(self, id, url_list,coroutine):
        Process.__init__(self)
        self.id = id
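
The module-level manager.list() proxies above are what let every traffic-generating process append to the same case, traffic and reflect lists. A minimal sketch of a manager.list() shared between processes:

from multiprocessing import Manager, Process


def collect(shared_list, start):
    # appends are forwarded to the manager process, so all workers see one list
    for i in range(start, start + 3):
        shared_list.append(i)


if __name__ == "__main__":
    manager = Manager()
    traffic_list = manager.list()
    procs = [Process(target=collect, args=(traffic_list, n)) for n in (0, 10)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    print(list(traffic_list))   # e.g. [0, 1, 2, 10, 11, 12]; order may interleave
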
Example #27
0
def write(q):
    print 'Write...(%s)' % os.getpid()
    for v in ['A', 'B', 'C']:
        print 'Put %s to queue...' % v
        q.put(v)
        time.sleep(random.random())


def read(q):
    print 'Read...(%s)' % os.getpid()
    while True:
        if not q.empty():
            v = q.get(True)
            print 'Get %s from queue...' % v
            time.sleep(random.random())
        else:
            break


if __name__ == '__main__':
    manager = Manager()
    q = manager.Queue()
    p = Pool()
    p.apply_async(write, args=(q, ))
    time.sleep(0.5)
    p.apply_async(read, args=(q, ))
    p.close()
    p.join()
    print 'All data has been written and read.'
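
Note that read() above only terminates cleanly because of the 0.5-second head start; if the reader ever drains the queue before the writer is done, q.empty() ends the loop early. A variant that closes that race with a sentinel instead of polling:

from multiprocessing import Manager, Pool


def write(q):
    for v in ['A', 'B', 'C']:
        q.put(v)
    q.put(None)                 # sentinel: writer is done


def read(q):
    while True:
        v = q.get(True)         # block instead of polling q.empty()
        if v is None:
            break
        print('Got', v)


if __name__ == '__main__':
    manager = Manager()
    q = manager.Queue()
    p = Pool()
    p.apply_async(write, args=(q,))
    p.apply_async(read, args=(q,))
    p.close()
    p.join()
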
Example #28
0
    def _create_queues(self):
        # Need to use multiprocessing.Manager here; a plain Queue() is buggy in
        # this setup and causes deadlocks
        manager = Manager()
        self._chunk_queue = manager.Queue()
        self._result_queue = manager.Queue()
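
The comment above points at a real constraint: a plain multiprocessing.Queue() cannot be handed to Pool workers as an argument (pickling it typically fails with a RuntimeError about sharing queues only through inheritance), while a Manager().Queue() proxy pickles cleanly. A small sketch of the working proxy variant:

from multiprocessing import Manager, Pool


def produce(q):
    q.put("chunk")


if __name__ == "__main__":
    manager = Manager()
    chunk_queue = manager.Queue()              # proxy object: safe to pass into Pool tasks
    pool = Pool(2)
    pool.apply_async(produce, (chunk_queue,)).get()
    pool.close()
    pool.join()
    print(chunk_queue.get())                   # -> "chunk"
    # swapping manager.Queue() for multiprocessing.Queue() here would fail when the
    # task arguments are pickled
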
Example #29
0
def calculateRINsFromPdbList(pdbs, fromScratch=True, forceCentrality=True, remove_tmp_files=True, n_proc=32):

    pdbs = set([x.lower() for x in pdbs])

    lim = 100 * 1024 * 1024 * 1024

    resource.setrlimit(resource.RLIMIT_AS, (lim, lim))

    if not os.path.isfile(settings.REDUCE_HET_DICT):
        print("%s not found" % settings.REDUCE_HET_DICT)
        sys.exit(1)

    os.environ["REDUCE_HET_DICT"] = settings.REDUCE_HET_DICT

    num_of_proc = n_proc

    manager = Manager()
    lock = manager.Lock()

    in_queue = manager.Queue()

    bio_pdbs = set()

    total_structures = 0

    subfolders = os.listdir(bio_assembly_path)  # bio_assembly_path: module-level path defined elsewhere in the project
    for subfolder in subfolders:
        sub_path = "%s/%s" % (bio_assembly_path, subfolder)
        files = os.listdir(sub_path)
        if not os.path.exists("%s/%s" % (base_path, subfolder)):  # base_path: module-level path defined elsewhere
            os.mkdir("%s/%s" % (base_path, subfolder))

        for fn in files:
            if fn.count('.pdb1.gz') == 1:
                pdbgz_path = "%s/%s" % (sub_path, fn)
                if os.path.getsize(pdbgz_path) > 50 * 1024 * 1024:
                    continue
                pdb_id = fn.replace('.pdb1.gz', '')
                if pdb_id not in pdbs:
                    continue

                bio_pdbs.add(pdb_id)
                in_queue.put((pdbgz_path, pdb_id))
                total_structures += 1

    subfolders = os.listdir(AU_path)  # AU_path: module-level path defined elsewhere
    for subfolder in subfolders:

        sub_path = "%s/%s" % (AU_path, subfolder)
        files = os.listdir(sub_path)
        if not os.path.exists("%s/%s" % (base_path, subfolder)):
            os.mkdir("%s/%s" % (base_path, subfolder))

        for fn in files:
            if fn.count('.ent.gz') == 1:
                pdbgz_path = "%s/%s" % (sub_path, fn)
                if os.path.getsize(pdbgz_path) > 50 * 1024 * 1024:
                    continue
                pdb_id = fn[3:7]
                if not '%s_au' % pdb_id in pdbs:
                    continue
                if pdb_id in bio_pdbs:
                    continue
                in_queue.put((pdbgz_path, pdb_id))
                total_structures += 1

    print('Amount of structures for RINerator: ', total_structures)

    processes = {}
    for i in range(1, num_of_proc + 1):
        p = Process(target=createRinProc, args=(in_queue, lock, fromScratch, i, forceCentrality, remove_tmp_files, base_path, rinerator_path, errorlog))  # createRinProc, rinerator_path and errorlog are defined elsewhere in the module
        processes[i] = p
        print('Start RINerator Process: ', i)
        p.start()
    for i in processes:
        processes[i].join()
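
The resource.setrlimit call near the top of the function caps the address space of the whole run so a single oversized structure raises MemoryError instead of exhausting the machine. A short sketch of that guard (Linux; the resource module is Unix-only):

import resource

# cap this process's address space at 1 GiB
lim = 1 * 1024 * 1024 * 1024
resource.setrlimit(resource.RLIMIT_AS, (lim, lim))

try:
    too_big = bytearray(2 * 1024 * 1024 * 1024)   # 2 GiB allocation exceeds the cap
except MemoryError:
    print("allocation blocked by RLIMIT_AS")
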
Example #30
0
class BaseProcessor:

    _logger = None
    _file_handler = None
    _process_list = []
    _manager = None
    _event_queue = None
    _task_queue = None
    _process_count = 0
    _use_verbose_logging = False

    def __init__(self, file_handler, process_count, use_verbose_logging):
        self._file_handler = file_handler
        self._process_count = process_count
        self._use_verbose_logging = use_verbose_logging

        self._logger = Logger()
        self._manager = Manager()
        self._event_queue = self._manager.Queue()
        self._task_queue = self._manager.Queue()

    def _get_process(self, process_id):
        raise AttributeError("not supported")

    def _run_processes(self, items_to_process, event_handler_func,
                       event_handler_args):
        total_to_process = len(items_to_process)

        processes = self._initialize_processes()
        self._fill_task_queue(items_to_process)

        self._process_events(total_to_process, event_handler_func,
                             event_handler_args)

        self._stop_processes(processes)

    def _initialize_processes(self):
        processes = []
        for i in range(self._process_count):
            process = self._get_process(i)
            processes.append(process)
            process.start()

        return processes

    def _fill_task_queue(self, items):
        for item in items:
            self._task_queue.put(item)

    def _process_events(self, total_to_process, event_handler_func,
                        event_handler_args):
        num_processed = 0
        num_processed_by_process_list = [0] * self._process_count

        while True:
            self._write_progress_to_console(num_processed, total_to_process,
                                            num_processed_by_process_list)

            event = None
            try:
                event = self._event_queue.get(True, 1)
            except:
                pass

            if event is not None:
                args_to_use = (event, num_processed_by_process_list,
                               num_processed, total_to_process)
                args_to_use += event_handler_args

                num_processed = event_handler_func(*args_to_use)

            if num_processed >= total_to_process:
                break

    def _stop_processes(self, processes):
        for i in range(self._process_count):
            self._task_queue.put(-1)

        for process in processes:
            process.join()

    def _write_progress_to_console(self, num_processed, total_to_process,
                                   num_processed_by_process_list):
        output_str = "Progress: " + str(num_processed) + "/" + str(
            total_to_process) + "        "

        for i in range(len(num_processed_by_process_list)):
            output_str += ("P" + str(i) + ": " +
                           str(num_processed_by_process_list[i]) + "  ")

        sys.stdout.write(output_str + "\r")
        sys.stdout.flush()

    def _log_process_message(self, process_id, message):
        if self._use_verbose_logging:
            self._logger.print_log("[process: " + str(process_id) + "] " +
                                   message)