def run(self, gen_dict_path: str, bio_dict_path: str, sentences: Dict[str, str]):
    """Apply dictionary term mappings to sentences in parallel via Ray.

    Args:
        gen_dict_path (str): path to a general-domain TSV dictionary (term \\t label).
        bio_dict_path (str): path to a biomedical TSV dictionary (term \\t label).
        sentences (Dict[str, str]): id -> sentence text.

    Returns:
        Dict merging the per-batch outputs of ``self.process_data``.
    """

    def _load_dict(path: str) -> Dict:
        """Load one TSV dictionary as {term: normalized label}."""
        # NOTE(review): header=1 treats the file's second line as the header,
        # skipping the first — confirm the TSVs really have a header on line 2.
        frame = pd.read_csv(path, sep="\t", header=1)
        return {
            row[0]: DICT_MAPPING.get(row[1], row[1])
            for _, row in frame.iterrows()
        }

    # General-domain entries override biomedical ones on key collisions.
    dict_mappings = {**_load_dict(bio_dict_path), **_load_dict(gen_dict_path)}

    batch_count = multiprocessing.cpu_count()
    batch_size = math.ceil(len(sentences) / batch_count)
    logger.info(f"Batch_size = {batch_size}")
    logger.info(f"Number of batches = {batch_count}")
    batches = create_dict_chunks(sentences, batch_size)
    batch_results = ray.get([
        self.process_data.remote(pos, dict_mappings, batch)
        for pos, batch in enumerate(batches)
    ])
    # Merge the per-batch dicts back into one id -> result mapping.
    results = reduce(lambda x, y: {**x, **y}, batch_results, {})
    return results
def run(
    self, paragraphs: Dict[str, str], coref_model_path: str, batch_size=8
) -> Dict[str, str]:
    """Run coreference resolution over a set of paragraphs.

    Args:
        paragraphs (Dict[str, str]): id -> paragraph text.
        coref_model_path (str): path to the serialized coreference model.
        batch_size (int): paragraphs sent to the predictor per batch.

    Returns:
        Dict[str, str]: id -> paragraph with coreferences substituted.
    """
    if torch.cuda.is_available():
        logger.info("GPU found")
        logger.info("Initializing Coreference predictor with GPU")
        predictor = Predictor.from_path(coref_model_path, cuda_device=0)
    else:
        logger.info("Initializing Coreference predictor with CPU")
        predictor = Predictor.from_path(coref_model_path)
    logger.info(f"Batch_size = {batch_size}")
    batches = create_dict_chunks(paragraphs, batch_size)
    resolved_values = {}
    for batch in tqdm(
        batches,
        desc="Running coreference resolution",
        total=math.ceil(len(paragraphs) / batch_size),
    ):
        predictions = predictor.predict_batch_json(
            [{"document": paragraph} for paragraph in batch.values()]
        )
        # Update in place: the original rebuilt the whole accumulator dict
        # ({**acc, **new}) every batch, which is quadratic in batch count.
        resolved_values.update(zip(batch.keys(), predictions))
    logger.success("Coreference resolution successful")
    logger.info("Resolving Coreference")
    results = {}
    for key, res in resolved_values.items():
        # Substitute cluster mentions back into the token stream, then re-join.
        new_paragraph = self.coref_sub(res["document"], res["clusters"])
        results[key] = " ".join(new_paragraph)
    return results
def run(self, sentences: Dict[str, str], model_path: str) -> Dict[str, List]:
    """Perform discourse simplification in parallel via Ray.

    Args:
        sentences (Dict[str, str]): id -> sentence text.
        model_path (str): path to the discourse-simplification model jar.

    Returns:
        Dict[str, List]: id -> simplification representation built from the
        raw per-sentence result.

    Raises:
        FileNotFoundError: if ``model_path`` does not point to a file.
    """
    logger.info(f"Loading model from {model_path}")
    if not os.path.isfile(model_path):
        logger.error(f"Discourse model jar not found in {model_path}")
        # Raise an instance carrying the offending path instead of the bare
        # class, so the message survives to callers and tracebacks.
        raise FileNotFoundError(f"Discourse model jar not found in {model_path}")
    batch_count = multiprocessing.cpu_count()
    batch_size = math.ceil(len(sentences) / batch_count)
    logger.info(f"Batch_size = {batch_size}")
    logger.info(f"Number of batches = {batch_count}")
    batches = create_dict_chunks(sentences, batch_size)
    logger.info("Running service.")
    batch_results = ray.get([
        self.process_data.remote(pos, model_path, batch)
        for pos, batch in enumerate(batches)
    ])
    # Each batch maps id -> Ray object ref; merge batches, then resolve refs.
    merged = reduce(lambda x, y: {**x, **y}, batch_results, {})
    results = {key: ray.get(obj_ref) for key, obj_ref in merged.items()}
    simplification_output = {
        key: self.build_representation(result, sentences[key])
        for key, result in results.items()
    }
    return simplification_output
def run(self, input, fn, fn_args, is_parallel=True, batch_count=-1, **kwargs):
    """Execute ``fn`` over ``input`` in parallel batches via Ray.

    Args:
        input: dict-like data, chunked with ``create_dict_chunks``.
        fn: callable with signature ``fn(pos, input, **fn_args)`` returning a dict.
        fn_args: keyword arguments forwarded to every ``fn`` invocation.
        is_parallel: when falsy, run ``fn`` once over the whole input instead.
        batch_count: number of batches; -1 means one batch per CPU core.

    Returns:
        Merged dict of all batch results (or the single ``fn`` result).
    """
    start = time.time()
    if is_parallel:  # plain truthiness instead of `== True`
        try:
            ray.init(ignore_reinit_error=True)
        except PermissionError:
            logger.warning(
                "Unable to create temp in /tmp directory due to PermissionError. Creating it locally"
            )
            # Expand "~" explicitly — a literal "~/tmp" would be taken as a
            # relative path named "~", not the user's home directory.
            ray.init(ignore_reinit_error=True, temp_dir=os.path.expanduser("~/tmp"))
        remote_fn = ray.remote(fn)
        batch_count = multiprocessing.cpu_count() if batch_count == -1 else batch_count
        batch_size = math.ceil(len(input) / batch_count)
        logger.info(f"Batch_size = {batch_size}")
        logger.info(f"Number of batches = {batch_count}")
        batches = create_dict_chunks(input, batch_size)
        logger.info("Running Ray Executor")
        batch_results = ray.get([
            remote_fn.remote(pos=pos, input=batch, **fn_args)
            for pos, batch in enumerate(tqdm(batches))
        ])
        combined_results = reduce(lambda x, y: {**x, **y}, batch_results, {})
    else:
        # Serial fallback: single call over the full input, pos fixed at 0.
        combined_results = fn(pos=0, input=input, **fn_args)
    end = time.time()
    logger.info(f"Processing time: {end-start}")
    return combined_results
def run(self, query_input: Dict, ix, limit=None, no_parallel=False, **kwargs):
    """Run queries against a vectorized index, optionally fanned out over Ray.

    Args:
        query_input (Dict): id -> query.
        ix: index triple ``(vectorizer, transformed_corpus, ids)``.
        limit: maximum hits per query, forwarded to ``self.query``.
        no_parallel: when True, run a single in-process ``self.query`` call.
        **kwargs: ``batch_count`` overrides the CPU-count default.

    Returns:
        Merged dict of per-query results.
    """
    vectorizer, transformed_corpus, ids = ix
    start = time.time()
    if not no_parallel:
        batch_count = kwargs.pop("batch_count", multiprocessing.cpu_count())
        batch_size = math.ceil(len(query_input) / batch_count)
        logger.info(f"Batch_size = {batch_size}")
        logger.info(f"Number of batches = {batch_count}")
        batches = create_dict_chunks(query_input, batch_size)
        logger.info("Querying")
        remote_fn = ray.remote(self.query)
        # Plain iteration — the original enumerate() produced a `pos`
        # that was never used.
        batch_results = ray.get([
            remote_fn.remote(
                query_input=batch,
                vectorizer=vectorizer,
                transformed_corpus=transformed_corpus,
                ids=ids,
                limit=limit,
            )
            for batch in tqdm(batches)
        ])
        query_output = reduce(lambda x, y: {**x, **y}, batch_results, {})
        end = time.time()
        logger.success(f"Query successful. Time taken: {end-start}")
    else:
        query_output = self.query(
            query_input=query_input,
            vectorizer=vectorizer,
            transformed_corpus=transformed_corpus,
            ids=ids,
            limit=limit,
        )
    return query_output
def run(self, text_input, is_fact=False):
    """Extract entities from the input texts in parallel via Ray.

    Args:
        text_input: dict-like data (id -> text), chunked with
            ``create_dict_chunks``.
        is_fact: flag accepted for interface compatibility; not read in
            this method body.

    Returns:
        Merged dict of extracted entities from all batches.
    """
    batch_count = multiprocessing.cpu_count()
    batch_size = math.ceil(len(text_input) / batch_count)
    logger.info(f"Batch_size = {batch_size}")
    logger.info(f"Number of batches = {batch_count}")
    batches = create_dict_chunks(text_input, batch_size)
    logger.info("Entity Extraction")
    start = time.time()
    # Plain iteration — the original enumerate() produced a `pos`
    # that was never used.
    batch_results = ray.get([
        self.process_batch.remote(text_input=batch)
        for batch in tqdm(batches)
    ])
    extracted_entites = reduce(lambda x, y: {**x, **y}, batch_results, {})
    end = time.time()
    # Typo fixed: "Enitity" -> "Entity".
    logger.success(f"Entity Extraction successful. Time taken: {end-start}")
    return extracted_entites
def run(self, parameters: List[Dict], end_point: str, input_parsed: Dict) -> List:
    """POST batches of input to a set of service workers and merge the results.

    Arguments:
        parameters {List[Dict]} -- service config; read keys: ``host`` (str)
            and ``port`` (list of ports, one per worker).
        end_point {String} -- endpoint path appended to each worker URL.
        input_parsed {Dict} -- input split across workers and sent as requests.

    Returns:
        List -- merged results from all service workers.
    """
    # ignore_reinit_error keeps this safe when Ray was already initialized
    # by another component (consistent with the Ray executor's init).
    ray.init(ignore_reinit_error=True)
    # One URL per configured worker port.
    api_urls = [
        f"http://{parameters['host']}:{p}/{end_point}"
        for p in parameters["port"]
    ]
    logger.info(f"End_point = {end_point}")
    # One batch per worker URL.
    batch_size = math.ceil(len(input_parsed) / len(api_urls))
    logger.info(f"Batch_size = {batch_size}")
    logger.info(f"Number of batches = {len(api_urls)}")
    batches = create_dict_chunks(input_parsed, batch_size)
    logger.info("Running service.")
    batch_results = ray.get([
        self.service_post.remote(pos, url, batch)
        for pos, (url, batch) in enumerate(zip(api_urls, batches))
    ])
    return reduce(lambda x, y: {**x, **y}, batch_results, {})