def run(self, gen_dict_path: str, bio_dict_path: str,
            sentences: Dict[str, str]):
        # Load both dictionaries (TSV, first column = key, second column = label)
        # and normalise each label through DICT_MAPPING.
        gen_dict = {
            row.iloc[0]: DICT_MAPPING.get(row.iloc[1], row.iloc[1])
            for _, row in pd.read_csv(gen_dict_path, sep="\t",
                                      header=1).iterrows()
        }
        bio_dict = {
            row.iloc[0]: DICT_MAPPING.get(row.iloc[1], row.iloc[1])
            for _, row in pd.read_csv(bio_dict_path, sep="\t",
                                      header=1).iterrows()
        }

        # Merge the two dictionaries; gen_dict entries take precedence on clashes.
        dict_mappings = {**bio_dict, **gen_dict}

        # One batch per CPU core.
        batch_count = multiprocessing.cpu_count()
        batch_size = math.ceil(len(sentences) / batch_count)

        logger.info(f"Batch_size = {batch_size}")
        logger.info(f"Number of batches = {batch_count}")

        batches = create_dict_chunks(sentences, batch_size)

        batch_results = ray.get([
            self.process_data.remote(pos, dict_mappings, batch)
            for pos, batch in enumerate(batches)
        ])
        results = reduce(lambda x, y: {**x, **y}, batch_results, {})
        return results
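All of these run methods rely on a create_dict_chunks helper that is not part of this listing. Judging from how it is called (a dict plus a batch size, then iterated as a sequence of smaller dicts), a minimal sketch could look like the following; the helper in the actual repository may differ in detail.

from itertools import islice
from typing import Dict, Iterator

def create_dict_chunks(data: Dict, batch_size: int) -> Iterator[Dict]:
    """Yield successive sub-dictionaries with at most batch_size items each."""
    items = iter(data.items())
    while True:
        chunk = dict(islice(items, batch_size))
        if not chunk:
            return
        yield chunk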
Example 2

    def run(
        self, paragraphs: Dict[str, str], coref_model_path: str, batch_size=8
    ) -> Dict[str, str]:
        """Run SRL extraction
        
        Args:
            paragraphs (Dict[str,str]): id: {paragraph}
        
        Returns:
            Dict[str, str]: return output id: paragraph with coreference resolution
        """

        if torch.cuda.is_available():
            logger.info("GPU found")
            logger.info("Initializing Coreference predictor with GPU")
            predictor = Predictor.from_path(coref_model_path, cuda_device=0)
        else:
            logger.info("Initializing Coreference predictor with CPU")
            predictor = Predictor.from_path(coref_model_path)

        logger.info(f"Batch_size = {batch_size}")
        batches = create_dict_chunks(paragraphs, batch_size)

        resolved_values = {}
        for batch in tqdm(
            batches,
            desc="Running coreference resolution",
            total=math.ceil(len(paragraphs) / batch_size),
        ):
            predictions = predictor.predict_batch_json(
                [{"document": paragraph} for paragraph in batch.values()]
            )
            resolved_values.update(zip(batch.keys(), predictions))
        logger.success("Coreference resolution successful")

        logger.info("Resolving Coreference")
        results = {}

        for key, res in resolved_values.items():
            new_paragraph = self.coref_sub(res["document"], res["clusters"])
            results[key] = " ".join(new_paragraph)

        return results
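The coref_sub helper used above is not shown in this listing. A common approach is to replace every later mention in a cluster with the text of the cluster's first (antecedent) span; the sketch below assumes that behaviour and an AllenNLP-style output format (a token list plus clusters of inclusive [start, end] spans), and is not necessarily what the source repository does.

from typing import List

def coref_sub(tokens: List[str], clusters: List[List[List[int]]]) -> List[str]:
    """Substitute coreferent mentions with their antecedent span (a sketch)."""
    resolved = list(tokens)
    for cluster in clusters:
        first_start, first_end = cluster[0]
        antecedent = " ".join(tokens[first_start:first_end + 1])
        for start, end in cluster[1:]:
            # Put the antecedent text at the start of the mention and blank the
            # rest of the span so the indices of other spans stay valid.
            resolved[start] = antecedent
            for i in range(start + 1, end + 1):
                resolved[i] = ""
    return [token for token in resolved if token]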
Example 3
    def run(self, sentences: Dict[str, str],
            model_path: str) -> Dict[str, List]:
        """Perform discourse simplification.

        Arguments:
            sentences {Dict[str, str]} -- dictionary of id -> sentence
            model_path {str} -- path to the discourse simplification model jar

        Returns:
            Dict[str, List] -- id -> simplified representation of the sentence
        """

        logger.info(f"Loading model from {model_path}")
        if not os.path.isfile(model_path):
            logger.error(f"Discourse model jar not found in {model_path}")
            raise FileNotFoundError

        batch_count = multiprocessing.cpu_count()
        batch_size = math.ceil(len(sentences) / batch_count)

        logger.info(f"Batch_size = {batch_size}")
        logger.info(f"Number of batches = {batch_count}")

        batches = create_dict_chunks(sentences, batch_size)

        logger.info("Running service.")

        batch_results = ray.get([
            self.process_data.remote(pos, model_path, batch)
            for pos, batch in enumerate(batches)
        ])
        # process_data returns id -> Ray ObjectRef for each item, so merge the
        # per-batch dicts and fetch the individual results.
        merged = reduce(lambda x, y: {**x, **y}, batch_results, {})
        results = {id: ray.get(o_id) for id, o_id in merged.items()}

        simplification_output = {
            id: self.build_representation(result, sentences[id])
            for id, result in results.items()
        }
        return simplification_output
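For context, this is roughly how the discourse-simplification runner above would be invoked; the class name and jar path below are placeholders rather than names taken from the repository.

import ray

ray.init(ignore_reinit_error=True)  # the runner assumes a Ray runtime is already up

simplifier = DiscourseSimplification()  # assumed class name
simplified = simplifier.run(
    sentences={"s1": "Although it rained heavily, the match went ahead."},
    model_path="models/DiscourseSimplification.jar",  # assumed location
)
# simplified maps each sentence id to its build_representation output.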
Example 4
    def run(self,
            input,
            fn,
            fn_args,
            is_parallel=True,
            batch_count=-1,
            **kwargs):
        """Apply fn to input, either serially or in parallel Ray batches."""
        start = time.time()
        if is_parallel:
            try:
                ray.init(ignore_reinit_error=True)
            except PermissionError:
                logger.warning(
                    "Unable to create temp in /tmp directoy due to PermissionError. Creating it locally"
                )
                ray.init(ignore_reinit_error=True, temp_dir="~/tmp")

            remote_fn = ray.remote(fn)

            batch_count = (multiprocessing.cpu_count()
                           if batch_count == -1 else batch_count)
            batch_size = math.ceil(len(input) / batch_count)

            logger.info(f"Batch_size = {batch_size}")
            logger.info(f"Number of batches = {batch_count}")

            batches = create_dict_chunks(input, batch_size)

            logger.info("Running Ray Executor")

            batch_results = ray.get([
                remote_fn.remote(pos=pos, input=batch, **fn_args)
                for pos, batch in enumerate(tqdm(batches))
            ])

            combined_results = reduce(lambda x, y: {**x, **y}, batch_results, {})
        else:
            combined_results = fn(pos=0, input=input, **fn_args)
        end = time.time()
        logger.info(f"Processing time: {end-start}")
        return combined_results
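A quick illustration of how this generic Ray executor is meant to be driven. The worker function and its extra argument below are invented for the example; the only real requirement, visible in the code above, is that fn accepts pos and input keyword arguments plus whatever is passed through fn_args.

def uppercase_batch(pos, input, suffix=""):
    # Each Ray task receives one batch (a sub-dict of the full input).
    return {key: value.upper() + suffix for key, value in input.items()}

executor = RayExecutor()  # assumed class name for the executor above
results = executor.run(
    input={"a": "first sentence", "b": "second sentence"},
    fn=uppercase_batch,
    fn_args={"suffix": "!"},
    is_parallel=True,
)
# results == {"a": "FIRST SENTENCE!", "b": "SECOND SENTENCE!"}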
Example 5
    def run(self,
            query_input: Dict,
            ix,
            limit=None,
            no_parallel=False,
            **kwargs):
        # ix is the prebuilt index: (fitted vectorizer, transformed corpus, document ids).
        vectorizer, transformed_corpus, ids = ix

        start = time.time()
        if not no_parallel:
            batch_count = kwargs.pop("batch_count",
                                     multiprocessing.cpu_count())
            batch_size = math.ceil(len(query_input) / batch_count)

            logger.info(f"Batch_size = {batch_size}")
            logger.info(f"Number of batches = {batch_count}")

            batches = create_dict_chunks(query_input, batch_size)

            logger.info("Querying")
            remote_fn = ray.remote(self.query)
            batch_results = ray.get([
                remote_fn.remote(
                    query_input=batch,
                    vectorizer=vectorizer,
                    transformed_corpus=transformed_corpus,
                    ids=ids,
                    limit=limit,
                ) for pos, batch in enumerate(tqdm(batches))
            ])
            query_output = reduce(lambda x, y: {**x, **y}, batch_results, {})
            end = time.time()
            logger.success(f"Query successful. Time taken: {end-start}")
        else:
            query_output = self.query(
                query_input=query_input,
                vectorizer=vectorizer,
                transformed_corpus=transformed_corpus,
                ids=ids,
                limit=limit,
            )

        return query_output
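The self.query worker is not included here. Given the index unpacked at the top of this method (a fitted vectorizer, the transformed corpus matrix, and the matching document ids), a plausible sketch is a cosine-similarity ranking such as the one below; the real implementation may score, threshold, or format results differently.

from sklearn.metrics.pairwise import cosine_similarity

def query(query_input, vectorizer, transformed_corpus, ids, limit=None):
    """Rank corpus documents against each query string (a sketch)."""
    output = {}
    for query_id, query_text in query_input.items():
        query_vec = vectorizer.transform([query_text])
        scores = cosine_similarity(query_vec, transformed_corpus)[0]
        ranked = sorted(zip(ids, scores), key=lambda pair: pair[1], reverse=True)
        output[query_id] = ranked[:limit] if limit else ranked
    return output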
Example 6
    def run(self, text_input, is_fact=False):
        batch_count = multiprocessing.cpu_count()
        batch_size = math.ceil(len(text_input) / batch_count)

        logger.info(f"Batch_size = {batch_size}")
        logger.info(f"Number of batches = {batch_count}")

        batches = create_dict_chunks(text_input, batch_size)

        logger.info("Entity Extraction")
        start = time.time()

        batch_results = ray.get([
            self.process_batch.remote(text_input=batch)
            for pos, batch in enumerate(tqdm(batches))
        ])

        extracted_entities = reduce(lambda x, y: {**x, **y}, batch_results, {})
        end = time.time()
        logger.success(
            f"Entity extraction successful. Time taken: {end-start}")
        return extracted_entities
Example 7
    def run(self, parameters: Dict, end_point: str,
            input_parsed: Dict) -> Dict:
        """Run the requested service end point against input_parsed.

        Arguments:
            parameters {Dict} -- {"host": service host, "port": list of worker ports}
            end_point {str} -- end point to call on each worker
            input_parsed {Dict} -- input that will be sent as the request body

        Returns:
            Dict -- merged results from all service workers
        """
        ray.init()

        api_urls = [
            f"http://{parameters['host']}:{p}/{end_point}"
            for p in parameters["port"]
        ]

        logger.info(f"End_point = {end_point}")

        batch_size = math.ceil(len(input_parsed) / len(api_urls))

        logger.info(f"Batch_size = {batch_size}")
        logger.info(f"Number of batches = {len(api_urls)}")

        batches = create_dict_chunks(input_parsed, batch_size)

        logger.info("Running service.")

        # Dispatch one batch to each worker URL.
        batch_results = ray.get([
            self.service_post.remote(pos, url, batch)
            for pos, (url, batch) in enumerate(zip(api_urls, batches))
        ])

        return reduce(lambda x, y: {**x, **y}, batch_results, {})
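service_post itself is not part of this listing. Assuming each worker exposes a plain JSON-over-HTTP endpoint, it could look roughly like the sketch below; the actual request and response format depends on the service being wrapped.

import ray
import requests

@ray.remote
def service_post(pos, url, batch):
    """POST one batch to a service worker and return its JSON reply (a sketch)."""
    response = requests.post(url, json=batch, timeout=600)
    response.raise_for_status()
    # The caller merges the per-batch dicts, so the service is expected to
    # return an id-keyed mapping mirroring its input batch.
    return response.json()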