def fetch(self, job_name):
    """
    This method is used to fetch results from remote nodes.

    :param job_name: the previously submitted job name
    :return:
    """
    job_metadata = self.vcluster_config.get('job-metadata')[job_name]
    self.virt_cluster = self.vcluster_config.get('virtual-cluster')[
        job_metadata['cluster_name']]
    self.runtime_config = self.vcluster_config.get('runtime-config')[
        job_metadata['config_name']]
    loaded_all_pids = [tuple(x) for x in job_metadata['nodes-pids']]
    all_pids = Manager().list()
    all_pids.extend(loaded_all_pids)
    pool = Pool(processes=self.runtime_config['download_proc_num'])
    print("collecting results")
    while len(all_pids) > 0:
        time.sleep(1)
        all_running_jobs = [(self, '_fetch_results_in_parallel',
                             job_metadata, node_pid_tuple, all_pids)
                            for node_pid_tuple in loaded_all_pids
                            if node_pid_tuple in all_pids]
        pool.map(self._execute_in_parallel, all_running_jobs)
        print("waiting for other results if any...")
    print("All of the remote results collected.")

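# Hedged sketch, not part of the original snippet: pool.map above is handed tuples of the
# form (self, '_fetch_results_in_parallel', ...), which suggests _execute_in_parallel is a
# small dispatcher method on the same class that resolves the method name on the first
# element and calls it with the rest. The body below is an assumption based on that call site.
def _execute_in_parallel(self, args):
    instance, method_name, *rest = args
    return getattr(instance, method_name)(*rest)
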
class Result:
    def __init__(self, urls_detail: dict, finished_urls: list, failed_urls: list,
                 config: Config, start_time, initial_time, end_time):
        self.urls_detail = Manager().dict()
        self.urls_detail.update(urls_detail)
        self.finished_urls = Manager().list()
        self.finished_urls.extend(finished_urls)
        self.failed_urls = Manager().list()
        self.failed_urls.extend(failed_urls)
        self.config = copy.deepcopy(config)
        self.start_time = start_time
        self.initial_time = initial_time
        self.end_time = end_time

    def get_failed_urls(self):
        return self.failed_urls

    def get_finished_urls(self):
        return self.finished_urls

    def get_urls_detail_dict(self):
        return self.urls_detail

    def retry_failed_urls(self, *new_config: Config):
        if len(self.failed_urls) == 0:
            print("no failed urls")
            return True
        config = copy.deepcopy(new_config[0] if len(new_config) == 1 else self.config)
        if len(new_config) == 1:
            config.list_config()
        retry_downloader = Downloader(config)
        result = retry_downloader.get_result(self.failed_urls)
        self.failed_urls = result.failed_urls
        for url in result.finished_urls:
            self.finished_urls.append(url)
        self.urls_detail.update(result.urls_detail)
        return True

    def show_time_cost(self):
        time_cost = '\n'.join([
            'initialize download tasks cost: {:.2f}s'.format(self.initial_time - self.start_time),
            'finish download task cost: {:.2f}s'.format(self.end_time - self.initial_time),
            'total cost: {:.2f}s'.format(self.end_time - self.start_time)
        ])
        print(time_cost)

    def show_urls_status(self):
        urls_status = '|'.join([
            'finished: ' + str(len(self.finished_urls)),
            'failed: ' + str(len(self.failed_urls)),
            'total: ' + str(len(self.finished_urls) + len(self.failed_urls))
        ])
        print(urls_status)

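# Hedged usage sketch: Downloader and Config are referenced by Result.retry_failed_urls
# but are not defined in this snippet, so the constructor call and get_result signature
# below are assumptions. It only illustrates how a returned Result might be inspected
# and retried.
urls = ["http://example.com/a.zip", "http://example.com/b.zip"]
result = Downloader(Config()).get_result(urls)
result.show_urls_status()          # e.g. finished: 1|failed: 1|total: 2
result.show_time_cost()
if len(result.get_failed_urls()) > 0:
    result.retry_failed_urls()     # retries with a deep copy of the original config
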
def fetch(self, job_name):
    """
    This method is used to fetch results from remote nodes.

    :param job_name: the previously submitted job name
    :return:
    """
    job_metadata = self.batch_config.get('job-metadata')[job_name]
    self.slurm_cluster = self.batch_config.get('slurm_cluster')[
        job_metadata['slurm_cluster_name']]
    loaded_all_job_ids = [x for x in job_metadata['jobIDs']]
    all_job_ids = Manager().list()
    all_job_ids.extend(loaded_all_job_ids)
    pool = Pool(processes=1)
    print("collecting results")
    while len(all_job_ids) > 0:
        time.sleep(1)
        all_running_jobs = [(self, '_fetch_results_in_parallel',
                             job_metadata, jobID, all_job_ids)
                            for jobID in loaded_all_job_ids
                            if jobID in all_job_ids]
        pool.map(self._execute_in_parallel, all_running_jobs)
        print("waiting for other results if any...")
    print("All of the remote results collected.")

class BlockchainGateway(object):
    """
    Blockchain Gateway

    The blockchain gateway listens to the blockchain and notifies the appropriate classes
    inside the Unix Service when there is relevant information ready for them. Follows an
    event-driven programming paradigm using a series of async loops for listening.

    In order for this to work, the following must be running:
        IPFS Daemon: `ipfs daemon`
        The lotion app: `node app_trivial.js` from dagora-chain
    For more specific instructions check travis.yaml to see how travis does it.
    """

    def __init__(self):
        """
        Initialize state, keys to empty lists. Everything else is left to configure().
        """
        self.state = Manager().list()
        self.event = Event()
        self.keys = []

    def configure(self, config_manager: object, communication_manager: object,
                  ipfs_client: object, dataset_manager: object):
        """
        Add communication_manager, ipfs_client, and set port.
        """
        self.communication_manager = communication_manager
        self._dataset_manager = dataset_manager
        config = config_manager.get_config()
        self._host = config.get("BLOCKCHAIN", "host")
        self._port = config.getint("BLOCKCHAIN", "http_port")
        self._timeout = config.getint("BLOCKCHAIN", "timeout")
        self._client = ipfs_client

    # Public methods for CRON

    def start_cron(self, period_in_mins: float = 0.05) -> None:
        """
        CRON method to listen. Runs asynchronously.
        """
        logging.info("Starting cron...")
        self._listen_as_event(
            period_in_mins,
            self._handle_new_session_creation,
            self._filter_new_session
        )

    def stop_cron(self) -> None:
        """
        Stop the CRON method.
        """
        self.event.set()
        logging.info("Cron stopped!")

    def reset(self) -> None:
        """
        Reset the gateway.

        This causes the Scheduler/Runners to no longer influence the Gateway's state.
        """
        self.event = Event()
        self.state = Manager().list()
        logging.info("Gateway reset!")

    def state_append(self, set_element):
        """
        Called by other Setter methods used in the rest of the service.
        Makes sure that the service doesn't pick up weights that were already generated.
        """
        logging.info("appending to state: {}".format(set_element))
        self.state.append(set_element)

    # Private methods to manage listening

    def _update_local_state(self, filtered_diffs: list) -> None:
        """
        Helper function to update the local state with freshly downloaded global state.
        """
        self.state.extend(filtered_diffs)

    def _listen(self, callback: Callable, event_filter: Callable) -> Tuple[Callable, Callable]:
        """
        Fetches the global state. Passes the global state to a filter to see all relevant
        transactions. Updates local state. If any relevant transactions are found, returns
        the callback result. Else, returns the arguments it was passed.
        """
        global_state_wrapper = get_global_state(self._host, self._port, self._timeout)
        state_diffs, filtered_diffs = filter_diffs(global_state_wrapper, self.state, event_filter)
        self._update_local_state(state_diffs)
        if filtered_diffs:
            return callback(filtered_diffs)
        else:
            return callback, event_filter

    def _listen_as_event(self, period_in_mins: float, callback: Callable,
                         event_filter: Callable) -> None:
        """
        Trigger above method every period.
        """
        new_callback, event_filter = self._listen(callback, event_filter)
        if not self.event.is_set():
            Timer(
                period_in_mins * 60,
                self._listen_as_event,
                [period_in_mins, new_callback, event_filter]
            ).start()

    def _handle_new_session_creation(self, txs: list) -> Tuple[Callable, Callable]:
        """
        Maps the handler onto all relevant transactions.
        Then returns the next handler and filter.
        """
        def handler(tx):
            assert TxEnum.KEY.name in tx
            key = tx.get(TxEnum.KEY.name)
            value = tx.get(TxEnum.CONTENT.name)
            args = Transaction(MessageEventTypes.NEW_SESSION.name,
                               Transaction(ipfs_to_content(self._client, key),
                                           ipfs_to_content(self._client, value),
                                           0).get_tx(),
                               0).get_tx()
            self.communication_manager.inform(RawEventTypes.NEW_MESSAGE.name, args)
        list(map(handler, txs))
        return self._handle_new_session_info, self._filter_new_session_info

    def _filter_new_session(self, tx: dict) -> bool:
        """
        Only allows new-session transactions through.
        """
        try:
            key_dict = ipfs_to_content(self._client, tx.get(TxEnum.KEY.name))
            return (self._dataset_manager.validate_key(key_dict["dataset_uuid"])
                    and tx.get(TxEnum.ROUND.name) == 0)
        except:
            return False

    def _filter_new_session_info(self, tx: dict) -> bool:
        """
        Only allows new-session-info transactions through.
        """
        return tx.get(TxEnum.ROUND.name) > 0

    def _handle_new_session_info(self, txs: list) -> Tuple[Callable, Callable]:
        """
        Maps the handler onto all relevant transactions.
        Then returns the next handler and filter.
        """
        def handler(tx):
            key = tx.get(TxEnum.KEY.name)
            value = tx.get(TxEnum.CONTENT.name)
            args = Transaction(MessageEventTypes.NEW_WEIGHTS.name,
                               ipfs_to_content(self._client, value), 0).get_tx()
            # TODO: Put into in-memory datastore.
            self.communication_manager.inform(RawEventTypes.NEW_MESSAGE.name, args)
        list(map(handler, txs))
        return self._handle_new_session_info, self._filter_new_session_info

class LazyPageThread(ABC):
    def __init__(self, keywords, category_url, page_range=[0, -1]):
        print("Cleaning keywords")
        keywords = "%20".join(map(lambda item: item.strip(), keywords.split(',')))
        search_url = category_url + "&keyword=" + keywords
        print("Cleaned keywords: " + search_url)
        self.max_item = 0
        self.item_num = 0
        self.page_range = page_range
        self.url = search_url
        self.html = ""
        self.products = Manager().list()
        self.options = Options()
        self.options.add_argument("--headless")
        self.options.add_argument("--no-sandbox")
        self.options.add_argument("--disable-dev-shm-usage")
        self.options.add_argument("--window-size=1920x1080")
        self.options.add_argument("--start-maximized")  # was "start-maximised", which Chrome ignores

    def run(self, keyClass, classForScroll, classForPageNumber):
        page = self.page_range[0]
        last_page = self.page_range[1]
        driver = webdriver.Chrome(ChromeDriverManager().install(),
                                  chrome_options=self.options)
        driver.get(self.url)
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, classForPageNumber)))
        finally:
            self.total_page = int(
                driver.find_element_by_class_name(classForPageNumber).text)
            driver.quit()
        cpu_num = psutil.cpu_count()
        print("Processor:::" + str(cpu_num))
        with Pool(processes=cpu_num) as pool:
            results = [
                pool.apply_async(
                    self.crawling,
                    (self.url, keyClass, classForScroll, i, self.options),
                ) for i in range(self.total_page)
            ]
            pool.close()
            pool.join()
        print(len(self.products))

    @abstractmethod
    def sort(self, value):
        pass

    def crawling(self, url, keyClass, classForScroll, page_num, options):
        url = url + "&page=" + str(page_num)
        driver = webdriver.Chrome(ChromeDriverManager().install(),
                                  chrome_options=options)
        driver.get(url)
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, keyClass)))
            scroll_down(
                driver,
                "document.getElementsByClassName('" + classForScroll + "')[0].clientHeight")
        finally:
            self.html = driver.page_source
            driver.quit()
        self.products.extend(self.handle_result())

    @abstractmethod
    def handle_result(self):
        pass

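# Hedged sketch of a concrete subclass, not part of the original code: LazyPageThread is
# abstract, so a crawler has to supply sort() and handle_result(). The bs4 import, the
# ".product-title" selector and the price field are illustrative assumptions.
from bs4 import BeautifulSoup

class ExamplePageThread(LazyPageThread):
    def sort(self, value):
        # order the collected products by an arbitrary field, e.g. "price"
        return sorted(self.products, key=lambda item: item.get(value, 0))

    def handle_result(self):
        # parse the page source captured by crawling() and return a list of product dicts
        soup = BeautifulSoup(self.html, "html.parser")
        return [{"title": node.get_text(strip=True)}
                for node in soup.select(".product-title")]

# Typical use (the URL and the three CSS class names depend on the target site):
# ExamplePageThread("shoes, running", "https://example.com/search?cat=1").run(
#     keyClass="product-title", classForScroll="product-list", classForPageNumber="page-count")
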
class SenseEmbedding(WordEmbedding.WordModel):
    """
    Implementation of Sense2Vec; NP, VP and POS tag based embedding
    reference : http://arxiv.org/pdf/1511.06388v1.pdf
    """
    # DO NOT change this ordering, need to figure out a better way to achieve this
    senses = ['NOUN', 'VERB', 'ADJECTIVE', 'CONJUNCTION', 'CARDINAL', 'DEFAULT']

    def __init__(self, data_sources, workers, *args, **kwargs):
        """
        Sense2vec embedding
        :param data_sources: list of data sources to pull data from
        :param workers: number of processes to create in the pool
        """
        WordEmbedding.WordModel.__init__(self, *args, **kwargs)
        self.sources = data_sources
        self.annotator = tools.Annotator()
        self.workers = workers
        self.tokenized_blocks = Manager().list()
        self.stemmer = PorterStemmer()
        self.stop_words = set(stopwords.words('english'))
        self.word_to_tag = defaultdict(list)

    def form_tag_tokens(self):
        for word_tag in self.model.vocab:
            word, tag = word_tag.split("|")
            self.word_to_tag[word].append(tag)

    def get_tags_for_word(self, word):
        token_tags = self.word_to_tag.get(word, None)
        if not token_tags:
            return []
        return [word + "|" + tag for tag in token_tags]

    def tokenize(self, text_block):
        sense_phrases = sense_tokenize(text_block, self.annotator, self.stemmer,
                                       self.stop_words)
        self.tokenized_blocks.extend(sense_phrases)

    def get_sense_vec(self, entity, dimension, sense='NOUN'):
        # note: the original mixed dict.has_key() (Python 2 only) with the `in` operator;
        # membership tests are written uniformly with `in` here
        if sense == 'NOUN':
            if entity + '|NOUN' in self.model.vocab:
                return self.model[entity + '|NOUN']
            elif entity + '|NP' in self.model.vocab:
                return self.model[entity + '|NP']
            else:
                entities = entity.split(" ")
                entity_vec = [self.model[e + '|NOUN'] for e in entities
                              if e + '|NOUN' in self.model.vocab]
                entity_vec.extend([self.get_vector(e, dimension, 'NOUN')
                                   for e in entities
                                   if e + '|NOUN' not in self.model.vocab])
                return np.average(entity_vec, axis=0)
        else:
            if entity + '|VERB' in self.model.vocab:
                return self.model[entity + '|VERB']
            elif entity + '|VP' in self.model.vocab:
                return self.model[entity + '|VP']
            else:
                entities = entity.split(" ")
                entity_vec = [self.model[e + '|VERB'] for e in entities
                              if e + '|VERB' in self.model.vocab]
                entity_vec.extend([self.get_vector(e, dimension, 'VERB')
                                   for e in entities
                                   if e + '|VERB' not in self.model.vocab])
                return np.average(entity_vec, axis=0)

    def get_vector(self, word, dimension, sense_except='NOUN'):
        words = [word] * (len(SenseEmbedding.senses) - 1)
        senses = list(SenseEmbedding.senses)
        senses.remove(sense_except)
        word_with_sense = [w + '|' + s for w, s in zip(words, senses)]
        for word in word_with_sense:
            if word in self.model.vocab:
                return self.model[word]
        return np.random.normal(0, 1, dimension)

    def form_model(self):
        text_blocks = []
        for source in self.sources:
            source.start()

        logger.info("Reading the text blocks from the source")
        for item_tuple in chain(*self.sources):
            if not item_tuple:
                logger.warn("item read from source is empty")
                continue
            item = " ".join([t[1] for t in item_tuple])
            if item == '':
                continue
            text_blocks.append(item)

        logger.info("Read all the text blocks")
        logger.info("Number of text blocks read : %d" % len(text_blocks))
        logger.info("will sentence and word tokenize the text blocks")

        pool = Pool(processes=self.workers)
        pool.map(self.tokenize, text_blocks, chunksize=2 * self.workers)
        pool.close()
        pool.join()

        self.batch_train(text_blocks=self.tokenized_blocks, tokenized=True)
        # form the token to tags map
        self.form_tag_tokens()

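# Hedged usage sketch: batch_train() and self.model come from the WordEmbedding.WordModel
# base class, which is not part of this snippet, so the constructor arguments and the data
# source object below are assumptions.
embedding = SenseEmbedding(data_sources=[my_source], workers=4)
embedding.form_model()                     # read sources, tokenize in a Pool, then train
vec = embedding.get_sense_vec("machine learning", dimension=128, sense='NOUN')
tags = embedding.get_tags_for_word("learning")   # e.g. ['NOUN', 'VERB']
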
pickle.dump(metadata, ff, protocol=pickle.HIGHEST_PROTOCOL)
print("metadata saved.")

if not nodownload:
    parallel_jobs.sync_pids_with_config()
    pool = Pool(processes=process_num_collect)
    print("collecting results")
    while len(all_pids) > 0:
        time.sleep(3)
        all_running_jobs = [(parallel_jobs, n_idx, n, all_pids)
                            for n_idx, n in enumerate(parallel_jobs.config)
                            if (n, parallel_jobs.config[n]['pid']) in all_pids]
        pool.map(collect_results_in_parallel, all_running_jobs)
        print("waiting for other results if any...")
    print("All of the remote results collected.")
else:
    if not os.path.isfile(metadatapath):
        # a missing file is a FileNotFoundError, not a FileExistsError
        raise FileNotFoundError("The metadata file %s does not exist." % metadatapath)
    parallel_jobs = run_in_parallel('', '', metarun=True)
    with open(metadatapath, "rb") as ff:
        metadata = pickle.load(ff)
    parallel_jobs.load_metadata(metadata)
    all_pids = Manager().list()
    all_pids.extend(parallel_jobs.all_pids)
    parallel_jobs.all_pids = all_pids
    pool = Pool(processes=process_num_collect)
    print("collecting results")
    while len(all_pids) > 0:
        time.sleep(3)
        all_running_jobs = [(parallel_jobs, n_idx, n, all_pids)
                            for n_idx, n in enumerate(parallel_jobs.config)
                            if (n, parallel_jobs.config[n]['pid']) in all_pids]
        pool.map(collect_results_in_parallel, all_running_jobs)
        print("waiting for other results if any...")
    print("All of the remote results collected.")

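# Hedged sketch (an assumption, not the original implementation): collect_results_in_parallel
# is mapped over (parallel_jobs, n_idx, n, all_pids) tuples above but is not defined in this
# fragment. A compatible worker would fetch the finished node's output and then remove its
# (node, pid) entry from the shared all_pids list so the polling loop can terminate.
# fetch_node_results is a hypothetical helper.
def collect_results_in_parallel(job_args):
    parallel_jobs, n_idx, n, all_pids = job_args
    pid = parallel_jobs.config[n]['pid']
    if fetch_node_results(parallel_jobs, n):   # hypothetical: True once results are downloaded
        all_pids.remove((n, pid))
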
class ProofOfWork(object):
    """
    class to solve Proof of Work with SHA1 hash
    """

    def __init__(self, base_string, difficulty, unwanted_chars=None):
        """
        :param base_string: base string to solve POW with
        :param difficulty: difficulty level to solve for
        :param unwanted_chars: unwanted utf-8 characters, if any.
               For eg: ('\n', '\r', '\t', ' ') ==> [0x0a, 0x0d, 0x09, 0x20]
        """
        if unwanted_chars is None:
            unwanted_chars = list()
        # Use multiprocessing Event to signal that hash has been found,
        # multiprocessing Value slows down the worker due to shared memory
        self.solved = Event()
        # Use multiprocessing Manager List to send the result suffix string back to calling process
        self.result = Manager().list()
        self.char_set = [x for x in range(0x00, 0x100) if x not in unwanted_chars]
        self.total_cpus = cpu_count()
        # Independent string generators to be used in different processes:
        #   Generator 1: strings of length 0, 8, 16, ...
        #   Generator 2: strings of length 1, 9, 17, ...
        #   ...
        self.string_generators = list()
        for i in range(self.total_cpus):
            self.string_generators.append(
                self._get_string_from_char_set(i, self.total_cpus))
        self.process_list = list()
        for index in range(self.total_cpus):
            self.process_list.append(
                Process(target=self._worker,
                        args=(base_string, self.string_generators[index], difficulty)))
        self.start_time = time()

    def _sha1_digest(self, string):
        return sha1(string).hexdigest()

    def _get_string_from_char_set(self, n, offset):
        """
        Yield strings in the char_set in increasing order of length
        """
        while True:
            for p in product(self.char_set, repeat=n):
                yield bytearray(p)
            n += offset

    def _worker(self, base_string, string_generator, difficulty):
        """
        Worker process to solve the POW in parallel.
        Prints a log line when a digest with the required number of leading zeros is found.

        :param base_string: Base String to use for proof of work
        :param string_generator: Generator method to generate strings
        :param difficulty: difficulty level to solve the base_string for
        :return:
        """
        while True:
            suffix = next(string_generator)
            hex_digest = self._sha1_digest(base_string + suffix)
            if hex_digest.startswith('0' * difficulty):
                print("%s: Time: %s mins, String: %s, Hash: %s..."
                      % (current_process().name,
                         (time() - self.start_time) / 60,
                         suffix, hex_digest[:10]))
                # only add the first found result in case two different processes solve at the same time
                if len(self.result) == 0:
                    self.result.extend(suffix)
                self.solved.set()
                break

    def solve(self):
        """
        Spawn different processes to solve the POW in parallel
        """
        start_time = time()
        for p in self.process_list:
            p.start()
        # wait for solved event to be set by any worker process
        self.solved.wait()
        # Terminate and join all processes
        for p in self.process_list:
            p.terminate()
            p.join()
        print("Joined in", (time() - start_time) / 60, "mins")
        return bytearray(self.result)

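# Hedged usage sketch: the base string and difficulty are illustrative values. Whitespace
# bytes are excluded from the suffix alphabet, matching the example in the constructor
# docstring.
pow_solver = ProofOfWork(b"challenge-prefix:", difficulty=6,
                         unwanted_chars=[0x0a, 0x0d, 0x09, 0x20])
suffix = pow_solver.solve()   # bytearray whose SHA1 digest starts with six zero hex digits
print(sha1(b"challenge-prefix:" + bytes(suffix)).hexdigest())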