Example #1
    def fetch(self, job_name):
        """
        This method is used to fetch results from remote nodes

        :param job_name: the previously submitted job name
        :return:
        """
        job_metadata = self.vcluster_config.get('job-metadata')[job_name]
        self.virt_cluster = self.vcluster_config.get('virtual-cluster')[
            job_metadata['cluster_name']]
        self.runtime_config = self.vcluster_config.get('runtime-config')[
            job_metadata['config_name']]
        loaded_all_pids = [tuple(x) for x in job_metadata['nodes-pids']]
        all_pids = Manager().list()
        all_pids.extend(loaded_all_pids)
        pool = Pool(processes=self.runtime_config['download_proc_num'])
        print("collecting results")
        while len(all_pids) > 0:
            time.sleep(1)
            all_running_jobs = [
                (self, '_fetch_results_in_parallel', job_metadata,
                 node_pid_tuple, all_pids)
                for node_pid_tuple in loaded_all_pids
                if node_pid_tuple in all_pids
            ]
            pool.map(self._execute_in_parallel, all_running_jobs)
            print("waiting for other results if any...")
        print("All of the remote results collected.")
Example #2
class Result:
    def __init__(self, urls_detail: dict, finished_urls: list,
                 failed_urls: list, config: Config, start_time, initial_time,
                 end_time):
        self.urls_detail = Manager().dict()
        self.urls_detail.update(urls_detail)
        self.finished_urls = Manager().list()
        self.finished_urls.extend(finished_urls)
        self.failed_urls = Manager().list()
        self.failed_urls.extend(failed_urls)
        self.config = copy.deepcopy(config)
        self.start_time = start_time
        self.initial_time = initial_time
        self.end_time = end_time

    def get_failed_urls(self):
        return self.failed_urls

    def get_finished_urls(self):
        return self.finished_urls

    def get_urls_detail_dict(self):
        return self.urls_detail

    def retry_failed_urls(self, *new_config: Config):
        if len(self.failed_urls) == 0:
            print("no failed urls")
            return True
        config = copy.deepcopy(
            new_config[0] if len(new_config) == 1 else self.config)
        if len(new_config) == 1:
            config.list_config()
        retry_downloader = Downloader(config)
        result = retry_downloader.get_result(self.failed_urls)
        self.failed_urls = result.failed_urls
        for url in result.finished_urls:
            self.finished_urls.append(url)
        self.urls_detail.update(result.urls_detail)
        return True

    def show_time_cost(self):
        time_cost = '\n'.join([
            'initialize download tasks cost: {:.2f}s'.format(
                self.initial_time - self.start_time),
            'finish download task cost: {:.2f}s'.format(self.end_time -
                                                        self.initial_time),
            'total cost: {:.2f}s'.format(self.end_time - self.start_time)
        ])
        print(time_cost)

    def show_urls_status(self):
        urls_status = '|'.join([
            'finished: ' + str(len(self.finished_urls)),
            'failed: ' + str(len(self.failed_urls)),
            'total: ' + str(len(self.finished_urls) + len(self.failed_urls))
        ])
        print(urls_status)
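A minimal sketch (with illustrative names, not part of the class above) of why the results live in Manager() proxies rather than plain containers: updates made inside Pool worker processes are only visible to the parent through the proxies.

from multiprocessing import Manager, Pool

def _download(args):
    # worker: record the outcome through the shared proxies
    url, urls_detail, finished_urls = args
    urls_detail[url] = 'ok'
    finished_urls.append(url)

if __name__ == '__main__':
    manager = Manager()
    urls_detail, finished_urls = manager.dict(), manager.list()
    with Pool(processes=2) as pool:
        pool.map(_download, [(u, urls_detail, finished_urls)
                             for u in ('url-a', 'url-b', 'url-c')])
    print(dict(urls_detail), list(finished_urls))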
Example #3
    def fetch(self, job_name):
        """
        This method is used to fetch results from remote nodes

        :param job_name: the previously submitted job name
        :return:
        """
        job_metadata = self.batch_config.get('job-metadata')[job_name]
        self.slurm_cluster = self.batch_config.get('slurm_cluster')[
            job_metadata['slurm_cluster_name']]
        loaded_all_job_ids = list(job_metadata['jobIDs'])
        all_job_ids = Manager().list()
        all_job_ids.extend(loaded_all_job_ids)
        pool = Pool(processes=1)
        print("collecting results")
        while len(all_job_ids) > 0:
            time.sleep(1)
            all_running_jobs = [
                (self, '_fetch_results_in_parallel', job_metadata, jobID,
                 all_job_ids)
                for jobID in loaded_all_job_ids
                if jobID in all_job_ids
            ]
            pool.map(self._execute_in_parallel, all_running_jobs)
            print("waiting for other results if any...")
        print("All of the remote results collected.")
Example #4
class BlockchainGateway(object):
    """
    Blockchain Gateway 
    The blockchain gateway listens to the blockchain and notifies the appropriate classes
    inside the Unix Service when there is relevant information ready for them. Follows
    an event-driven programming paradigm using a series of async loops for listening.
    In order for this to work, the following must be running:
        IPFS Daemon: `ipfs daemon`
        The lotion app: `node app_trivial.js` from dagora-chain
    For more specific instructions check travis.yaml to see how travis does it.
    """

    def __init__(self):
        """
        Initialize state, keys to empty lists. Everything else is left to configure().
        """
        self.state = Manager().list()
        self.event = Event()
        self.keys = []

    def configure(self, config_manager: object, communication_manager: object, 
                    ipfs_client: object, dataset_manager: object):
        """
        Add communication_manager, ipfs_client, and set port.
        """
        self.communication_manager = communication_manager
        self._dataset_manager = dataset_manager
        config = config_manager.get_config()
        self._host = config.get("BLOCKCHAIN", "host")
        self._port = config.getint("BLOCKCHAIN", "http_port")
        self._timeout = config.getint("BLOCKCHAIN", "timeout")
        self._client = ipfs_client

    # Public methods for CRON
    
    def start_cron(self, period_in_mins: float=0.05) -> None:
        """
        CRON method to listen. Runs asynchronously.
        """
        logging.info("Starting cron...")
        self._listen_as_event(
                        period_in_mins, 
                        self._handle_new_session_creation,
                        self._filter_new_session
        )

    def stop_cron(self) -> None:
        """
        Stop the CRON method.
        """
        self.event.set()
        logging.info("Cron stopped!")

    def reset(self) -> None:
        """
        Reset the gateway
        This causes the Scheduler/Runners to no longer influence the Gateway's state
        """
        self.event = Event()
        self.state = Manager().list()
        logging.info("Gateway reset!")

    def state_append(self, set_element):
        """
        Called by setter methods used in the rest of the service.
        Ensures that the service doesn't pick up weights that were
        already generated.
        """
        logging.info("appending to state: {}".format(set_element))
        self.state.append(set_element)

    # Private methods to manage listening

    def _update_local_state(self, filtered_diffs: list) -> None:
        """
        Helper function to update the local state with freshly downloaded global state.
        """
        self.state.extend(filtered_diffs)
    
    def _listen(self, callback: Callable, 
                event_filter: Callable) -> Tuple[Callable, Callable]:
        """
        Fetches the global state.
        Passes the global state to a filter to see all relevant transactions.
        Updates local state.
        If any relevant transactions found, returns the callback result.
        Else, returns the arguments it was passed.
        """
        global_state_wrapper = get_global_state(self._host, self._port, self._timeout)
        state_diffs, filtered_diffs = filter_diffs(global_state_wrapper, self.state, event_filter)
        self._update_local_state(state_diffs)
        if filtered_diffs:
            return callback(filtered_diffs)
        else:
            return callback, event_filter

    def _listen_as_event(self, 
                        period_in_mins: float, 
                        callback: Callable, 
                        event_filter: Callable) -> None:
        """
        Trigger above method every period.
        """
        new_callback, event_filter = self._listen(callback, event_filter)
        if not self.event.is_set():
            Timer(
                period_in_mins * 60,
                self._listen_as_event,
                [period_in_mins, new_callback, event_filter]
            ).start()

    def _handle_new_session_creation(self, txs: list) -> Tuple[Callable, Callable]:
        """
        Maps the handler onto all relevant transactions.
        Then returns the next handler and filter.
        """
        def handler(tx):
            assert TxEnum.KEY.name in tx
            key = tx.get(TxEnum.KEY.name)
            value = tx.get(TxEnum.CONTENT.name)
            args = Transaction(MessageEventTypes.NEW_SESSION.name,
                                Transaction(ipfs_to_content(self._client, key),
                                            ipfs_to_content(self._client, value), 0).get_tx(),
                                0).get_tx()
            self.communication_manager.inform(RawEventTypes.NEW_MESSAGE.name, args)
        list(map(handler, txs))
        return self._handle_new_session_info, self._filter_new_session_info

    def _filter_new_session(self, tx: dict) -> bool:
        """
        Only allows new-session transactions through.
        """
        try:
            key_dict = ipfs_to_content(self._client, tx.get(TxEnum.KEY.name))
            return self._dataset_manager.validate_key(key_dict["dataset_uuid"]) and tx.get(TxEnum.ROUND.name) == 0
        except Exception:
            return False

    def _filter_new_session_info(self, tx: dict) -> bool:
        """
        Only allows new-session-info transactions through.
        """
        return tx.get(TxEnum.ROUND.name) > 0

    def _handle_new_session_info(self, txs: list) -> Tuple[Callable, Callable]:
        """
        Maps the handler onto all relevant transactions.
        Then returns the next handler and filter.
        """
        def handler(tx):
            key = tx.get(TxEnum.KEY.name)
            value = tx.get(TxEnum.CONTENT.name)
            args = Transaction(MessageEventTypes.NEW_WEIGHTS.name,
                                ipfs_to_content(self._client, value), 0).get_tx()
            # TODO: Put into in-memory datastore.
            self.communication_manager.inform(
                RawEventTypes.NEW_MESSAGE.name, args)
        list(map(handler, txs))
        return self._handle_new_session_info, self._filter_new_session_info
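The listening loop above boils down to a self re-arming Timer gated by an Event. A minimal sketch of just that scheduling pattern, assuming the Event and Timer come from threading and using illustrative names:

from threading import Event, Timer

def listen_as_event(period_in_mins, stop_event, callback):
    callback()
    if not stop_event.is_set():
        # re-arm the timer for the next period unless stop was requested
        Timer(period_in_mins * 60, listen_as_event,
              [period_in_mins, stop_event, callback]).start()

stop_event = Event()
listen_as_event(0.05, stop_event, lambda: print("polling the chain..."))
# ... later, from any thread:
stop_event.set()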
class LazyPageThread(ABC):
    def __init__(
        self,
        keywords,
        category_url,
        page_range=(0, -1),
    ):
        print("Cleaning keywords")
        keywords = "%20".join(
            map(lambda item: item.strip(), keywords.split(',')))
        search_url = category_url + "&keyword=" + keywords
        print("Cleaned keywords: " + search_url)

        self.max_item = 0
        self.item_num = 0
        self.page_range = page_range
        self.url = search_url
        self.html = ""
        self.products = Manager().list()

        self.options = Options()
        self.options.add_argument("--headless")
        self.options.add_argument("--no-sandbox")
        self.options.add_argument("--disable-dev-shm-usage")
        self.options.add_argument("--window-size=1920x1080")
        self.options.add_argument("start-maximised")

    def run(self, keyClass, classForScroll, classForPageNumber):
        page = self.page_range[0]
        last_page = self.page_range[1]

        driver = webdriver.Chrome(ChromeDriverManager().install(),
                                  chrome_options=self.options)
        driver.get(self.url)

        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.CLASS_NAME, classForPageNumber)))
        finally:
            self.total_page = int(
                driver.find_element_by_class_name(classForPageNumber).text)
            driver.quit()

        cpu_num = psutil.cpu_count()

        print("Processor:::" + str(cpu_num))

        with Pool(processes=cpu_num) as pool:
            results = [
                pool.apply_async(
                    self.crawling,
                    (
                        self.url,
                        keyClass,
                        classForScroll,
                        i,
                        self.options,
                    ),
                ) for i in range(self.total_page)
            ]

            pool.close()
            pool.join()

        print(len(self.products))

    @abstractmethod
    def sort(self, value):
        pass

    def crawling(self, url, keyClass, classForScroll, page_num, options):
        url = url + "&page=" + str(page_num)
        driver = webdriver.Chrome(ChromeDriverManager().install(),
                                  chrome_options=options)
        driver.get(url)

        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, keyClass)))
            scroll_down(
                driver, "document.getElementsByClassName('" + classForScroll +
                "')[0].clientHeight")
        finally:
            self.html = driver.page_source
            driver.quit()

        self.products.extend(self.handle_result())

    @abstractmethod
    def handle_result(self):
        pass
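A minimal sketch (illustrative, not the scraper above) of the collection pattern it relies on: items appended to a Manager().list() inside apply_async workers survive back in the parent, and calling .get() on each AsyncResult also surfaces worker exceptions, which the unused results list in the example never does.

from multiprocessing import Manager, Pool

def crawl_page(page_num, products):
    # worker: append whatever was scraped for this page to the shared list
    products.append({'page': page_num, 'items': []})

if __name__ == '__main__':
    products = Manager().list()
    with Pool(processes=4) as pool:
        results = [pool.apply_async(crawl_page, (p, products)) for p in range(8)]
        for r in results:
            r.get()  # re-raises any exception the worker hit
    print(len(products))  # -> 8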
class SenseEmbedding(WordEmbedding.WordModel):
    """
        Implementation of Sense2Vec;  NP, VP and POS tag based embedding
        reference : http://arxiv.org/pdf/1511.06388v1.pdf
        """

    # DO NOT change this ordering, need to figure out a better way to achieve this
    senses = [
        'NOUN', 'VERB', 'ADJECTIVE', 'CONJUNCTION', 'CARDINAL', 'DEFAULT'
    ]

    def __init__(self, data_sources, workers, *args, **kwargs):
        """
        Sense2vec embedding
        :param data_sources: list of data sources to pull data from
        :param workers: number of processes to create in the pool
        """
        WordEmbedding.WordModel.__init__(self, *args, **kwargs)
        self.sources = data_sources
        self.annotator = tools.Annotator()
        self.workers = workers
        self.tokenized_blocks = Manager().list()
        self.stemmer = PorterStemmer()
        self.stop_words = set(stopwords.words('english'))
        self.word_to_tag = defaultdict(list)

    def form_tag_tokens(self):
        for word_tag in self.model.vocab:
            word, tag = word_tag.split("|")
            self.word_to_tag[word].append(tag)

    def get_tags_for_word(self, word):
        token_tags = self.word_to_tag.get(word, None)
        if not token_tags: return []
        return [word + "|" + tag for tag in token_tags]

    def tokenize(self, text_block):
        sense_phrases = sense_tokenize(text_block, self.annotator,
                                       self.stemmer, self.stop_words)
        self.tokenized_blocks.extend(sense_phrases)

    def get_sense_vec(self, entity, dimension, sense='NOUN'):

        if sense == 'NOUN':
            if entity + '|NOUN' in self.model.vocab:
                return self.model[entity + '|NOUN']

            elif entity + '|NP' in self.model.vocab:
                return self.model[entity + '|NP']

            else:
                entities = entity.split(" ")
                entity_vec = [
                    self.model[e + '|NOUN'] for e in entities
                    if e + '|NOUN' in self.model.vocab
                ]
                entity_vec.extend([
                    self.get_vector(e, dimension, 'NOUN') for e in entities
                    if e + '|NOUN' not in self.model.vocab
                ])
                return np.average(entity_vec, axis=0)

        else:
            if entity + '|VERB' in self.model.vocab:
                return self.model[entity + '|VERB']

            elif entity + '|VP' in self.model.vocab:
                return self.model[entity + '|VP']

            else:
                entities = entity.split(" ")
                entity_vec = [
                    self.model[e + '|VERB'] for e in entities
                    if e + '|VERB' in self.model.vocab
                ]
                entity_vec.extend([
                    self.get_vector(e, dimension, 'VERB') for e in entities
                    if e + '|VERB' not in self.model.vocab
                ])
                return np.average(entity_vec, axis=0)

    def get_vector(self, word, dimension, sense_except='NOUN'):

        words = [word] * (len(SenseEmbedding.senses) - 1)
        senses = list(SenseEmbedding.senses)
        senses.remove(sense_except)
        word_with_sense = [w + '|' + s for w, s in zip(words, senses)]
        for word in word_with_sense:
            if word in self.model.vocab:
                return self.model[word]

        return np.random.normal(0, 1, dimension)

    def form_model(self):
        text_blocks = []
        for source in self.sources:
            source.start()

        logger.info("Reading the text blocks from the source")
        for item_tuple in chain(*self.sources):
            if not item_tuple:
                logger.warning("item read from source is empty")
                continue

            item = " ".join([t[1] for t in item_tuple])
            if item == '': continue
            text_blocks.append(item)

        logger.info("Read all the text blocks")
        logger.info("Number of text blocks read : %d" % len(text_blocks))
        logger.info("will sentence and word tokenize the text blocks")

        pool = Pool(processes=self.workers)
        pool.map(self.tokenize, text_blocks, chunksize=2 * self.workers)
        pool.close()
        pool.join()
        self.batch_train(text_blocks=self.tokenized_blocks, tokenized=True)
        # form the token to tags map
        self.form_tag_tokens()
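A simplified stand-in (a plain dict of vectors instead of a gensim vocabulary; names and values are illustrative) for the "token|SENSE" key scheme that get_sense_vec falls back through: try the exact sense key first, then average the per-token vectors.

import numpy as np

model = {'machine|NOUN': np.array([1.0, 0.0]),
         'learning|NOUN': np.array([0.0, 1.0])}

def sense_vec(entity, sense='NOUN'):
    key = entity + '|' + sense
    if key in model:
        return model[key]
    # fall back to averaging per-token sense vectors (zeros stand in for the
    # random fallback vector the real get_vector() would return)
    tokens = entity.split(' ')
    return np.average([model.get(t + '|' + sense, np.zeros(2)) for t in tokens],
                      axis=0)

print(sense_vec('machine learning'))  # -> [0.5 0.5]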
                pickle.dump(metadata, ff, protocol=pickle.HIGHEST_PROTOCOL)
            print("metadata saved.")
        if not nodownload:
            parallel_jobs.sync_pids_with_config()
            pool = Pool(processes=process_num_collect)
            print("collecting results")
            while len(all_pids) > 0:
                time.sleep(3)
                all_running_jobs = [
                    (parallel_jobs, n_idx, n, all_pids)
                    for n_idx, n in enumerate(parallel_jobs.config)
                    if (n, parallel_jobs.config[n]['pid']) in all_pids
                ]
                pool.map(collect_results_in_parallel, all_running_jobs)
                print("waiting for other results if any...")

            print("All of the remote results collected.")
    else:
        if not os.path.isfile(metadatapath):
            raise FileNotFoundError(
                "The metadata file %s does not exist." % metadatapath)
        parallel_jobs = run_in_parallel('', '', metarun=True)
        with open(metadatapath, "rb") as ff:
            metadata = pickle.load(ff)
        parallel_jobs.load_metadata(metadata)
        all_pids = Manager().list()
        all_pids.extend(parallel_jobs.all_pids)
        parallel_jobs.all_pids = all_pids
        pool = Pool(processes=process_num_collect)
        print("collecting results")
        while len(all_pids) > 0:
            time.sleep(3)
            all_running_jobs = [
                (parallel_jobs, n_idx, n, all_pids)
                for n_idx, n in enumerate(parallel_jobs.config)
                if (n, parallel_jobs.config[n]['pid']) in all_pids
            ]
            pool.map(collect_results_in_parallel, all_running_jobs)
            print("waiting for other results if any...")
        print("All of the remote results collected.")
Example #9
class ProofOfWork(object):
    """
    class to solve Proof of Work with SHA1 hash
    """
    def __init__(self, base_string, difficulty, unwanted_chars=None):
        """
        :param base_string: base string to solve POW with
        :param difficulty:  difficulty level to solve for
        :param unwanted_chars: unwanted UTF-8 characters, if any,
                               e.g. ('\n', '\r', '\t', ' ') ==> [0x0a, 0x0d, 0x09, 0x20]
        """
        if unwanted_chars is None:
            unwanted_chars = list()

        # Use multiprocessing Event to signal that hash has been found,
        # multiprocessing Value slows down the worker due to shared memory
        self.solved = Event()
        # Use multiprocessing Manager List to send the result suffix string back to calling process
        self.result = Manager().list()

        self.char_set = [
            x for x in range(0x00, 0x100) if x not in unwanted_chars
        ]

        self.total_cpus = cpu_count()
        """
        independent string_generators to be used in different processes
        Generator 1: Generate strings of length 0, 8, 16, ...
        Generator 2: Generate strings of length 1, 9, 17, ...
                        .
                        .
                        .
        """
        self.string_generators = list()
        for i in range(self.total_cpus):
            self.string_generators.append(
                self._get_string_from_char_set(i, self.total_cpus))

        self.process_list = list()
        for index in range(self.total_cpus):
            self.process_list.append(
                Process(target=self._worker,
                        args=(base_string, self.string_generators[index],
                              difficulty)))

        self.start_time = time()

    def _sha1_digest(self, string):
        return sha1(string).hexdigest()

    def _get_string_from_char_set(self, n, offset):
        """
        Yield strings in the char_set in increasing order of length
        """
        while True:
            for p in product(self.char_set, repeat=n):
                yield bytearray(p)
            n += offset

    def _worker(self, base_string, string_generator, difficulty):
        """
        Worker thread to solve pow in parallel threads
        Prints log when the digest with preceding zeros more than 5
        :param base_string: Base String to use for proof of work
        :param string_generator: Generator method to generate strings
        :param difficulty: difficulty level to solve the base_string for
        :return:
        """
        while True:
            suffix = next(string_generator)
            hex_digest = self._sha1_digest(base_string + suffix)
            if hex_digest.startswith('0' * difficulty):
                print(
                    "%s: Time: %s mins, String: %s, Hash: %s..." %
                    (current_process().name,
                     (time() - self.start_time) / 60, suffix, hex_digest[:10]))
                # only record the first result in case two processes solve at the same time
                if len(self.result) == 0:
                    self.result.extend(suffix)
                self.solved.set()
                break

    def solve(self):
        """
        Spawn multiple processes to solve the POW in parallel.
        """
        start_time = time()
        for p in self.process_list:
            p.start()

        # wait for the solved event to be set by any worker process.
        self.solved.wait()

        # Terminate and join all processes
        for p in self.process_list:
            p.terminate()
            p.join()
        print("Joined in", (time() - start_time) / 60, "mins")
        return bytearray(self.result)
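A hedged usage sketch (parameter values are illustrative): base_string has to be bytes, since the worker concatenates it with a bytearray suffix before hashing, and difficulty is the number of leading zero hex digits required in the SHA1 digest.

if __name__ == '__main__':
    pow_solver = ProofOfWork(b'challenge-', difficulty=5,
                             unwanted_chars=[0x0a, 0x0d, 0x09, 0x20])
    suffix = pow_solver.solve()   # blocks until some worker finds a digest
    print(bytes(suffix))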