def generate_matrices_remote(self,
                             containerid: str = None,
                             feats: int = 10,
                             words: int = 6,
                             vectors_path: str = None,
                             docs_per_feat: int = 0,
                             feats_per_doc: int = 3):
    """
    Generates matrices on the remote server. This is used when nlp lives on
    its own machine.
    """
    container = Container.get_object(pk=containerid)
    FeaturesStatus.set_status_feats(
        containerid=container.pk,
        busy=True,
        feats=feats,
    )
    kwds = {
        'containerid': containerid,
        'feats': int(feats),
        'words': words,
        'docs_per_feat': int(docs_per_feat),
        'feats_per_doc': int(feats_per_doc),
        'path': container.get_folder_path(),
    }
    if vectors_path and os.path.isfile(vectors_path):
        # The vectors already exist; only the factorisation has to be redone.
        celery.send_task(NLP_TASKS['factorize_matrices'], kwargs=kwds)
    else:
        # No vectors yet; compute the matrices from scratch.
        celery.send_task(NLP_TASKS['compute_matrices'], kwargs=kwds)
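
# Producer-side sketch (illustrative, not part of this module): the web layer
# is assumed to enqueue the function above through the name registered under
# RMXWEB_TASKS['generate_matrices_remote']; the parameter values below are
# placeholders.
def example_request_matrices(container):
    celery.send_task(RMXWEB_TASKS['generate_matrices_remote'], kwargs={
        'containerid': container.pk,
        'feats': 10,
        'vectors_path': container.get_vectors_path(),
        'words': 6,
        'docs_per_feat': 0,
        'feats_per_doc': 3,
    })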
def monitor_crawl(containerid: int = None, crawlid: str = None):
    """
    This task takes care of the crawl callback.

    The first parameter is empty because it is called as a linked task
    receiving a list of endpoints from the scraper.
    """
    # crawl_metrics is linked to process_crawl_resp; celery passes the metrics
    # result as the first argument of the linked signature.
    celery.send_task(
        RMXWEB_TASKS['crawl_metrics'],
        kwargs={'containerid': containerid},
        link=process_crawl_resp.s(containerid, crawlid)
    )
def delete_data_from_container(containerid: str = None,
                               data_ids: List[int] = None):
    """
    Deletes data objects from a container and, if a matrix exists, triggers an
    integrity check.

    :param containerid: id of the container
    :param data_ids: ids of the data objects to delete
    :return:
    """
    container = Container.get_object(containerid)
    DataModel.delete_many(data_ids=data_ids, containerid=containerid)
    if container.matrix_exists:
        celery.send_task(RMXWEB_TASKS['integrity_check'],
                         kwargs={'containerid': containerid})
def integrity_check(containerid: str = None):
    """
    Checks the integrity of the container after the crawler finishes.

    :param containerid: id of the container
    :return:
    """
    obj = Container.get_object(pk=containerid)
    obj.set_integrity_check_in_progress()
    celery.send_task(NLP_TASKS['integrity_check'], kwargs={
        'containerid': containerid,
        'path': obj.get_folder_path(),
    })
def get_features(containerid: int = None, container=None, feats: int = None,
                 words: int = None, **_):
    """
    Retrieves features from nlp for a given container.

    :param containerid: id of the container
    :param container: the Container instance
    :param feats: number of features to retrieve
    :param words: number of words per feature
    :return: a dict with a 'success' flag and, on success, the features
    """
    resp = celery.send_task(NLP_TASKS['retrieve_features'], kwargs={
        'containerid': containerid,
        'feats': feats,
        'path': container.get_folder_path(),
        'words': words
    }).get()
    if resp:
        return {'success': True, 'data': resp}
    return {
        'success': False,
        'msg': f'no features for feature number {feats}'
    }
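
# Usage sketch (illustrative): callers are assumed to pass the Container
# instance along with its id and to branch on the 'success' flag of the
# returned dict.
def example_fetch_features(container, feats=10, words=6):
    resp = get_features(containerid=container.pk, container=container,
                        feats=feats, words=words)
    if resp['success']:
        return resp['data']
    raise ValueError(resp['msg'])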
def get_available_features(containerid: int = None, folder_path: str = None):
    """Retrieves the available features from nlp."""
    # Note: the remote task is addressed with 'corpusid' rather than
    # 'containerid'.
    return celery.send_task(NLP_TASKS['available_features'], kwargs={
        'corpusid': containerid,
        'path': folder_path
    }).get()
def crawl_async(url_list: list = None, containerid=None, depth=1):
    """
    Starts the crawler in scrasync and schedules the task that will monitor
    the crawl.
    """
    crawlid = celery.send_task(SCRASYNC_TASKS['launch_crawl'], kwargs={
        'endpoint': url_list,
        'containerid': containerid,
        'depth': depth
    }).get()
    # The countdown makes sure that the monitoring task does not start
    # immediately, as prometheus may still be empty.
    celery.send_task(RMXWEB_TASKS['monitor_crawl'], kwargs={
        'containerid': containerid,
        'crawlid': crawlid
    }, countdown=CRAWL_START_MONITOR_COUNTDOWN)
    return crawlid
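
# Entry-point sketch with placeholder values: launch a crawl for a container
# and keep the returned crawl id for later status lookups.
def example_start_crawl(containerid):
    return crawl_async(url_list=['https://example.org'],
                       containerid=containerid,
                       depth=1)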
def wrapped_view(containerid: int = None, words: int = 10, features: int = 10,
                 docsperfeat: int = 5, featsperdoc: int = 3, **kwds):
    # `func` is the decorated view, bound by the enclosing decorator.
    container = Container.get_object(pk=containerid)
    availability = container.features_availability(feature_number=features)
    out = {
        'busy': True,
        'retry': True,
        'success': False,
        'available': False,
        'features': features,
        'containerid': container.pk
    }
    if availability.get('busy'):
        return out
    if availability.get('available'):
        out = {
            'words': words,
            'feats': features,
            'docs_per_feat': docsperfeat,
            'feats_per_doc': featsperdoc,
            'container': container,
            'containerid': container.pk
        }
        out.update(kwds)
        return func(**out)
    # Features are neither available nor being computed; request them and
    # return the busy response so the client can retry.
    celery.send_task(config.RMXWEB_TASKS['generate_matrices_remote'], kwargs={
        'containerid': container.pk,
        'feats': features,
        'vectors_path': container.get_vectors_path(),
        'words': words,
        'docs_per_feat': docsperfeat,
        'feats_per_doc': featsperdoc
    })
    out.update(availability)
    return out
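
# `wrapped_view` closes over `func`, so it is assumed to be defined inside a
# decorator along the lines of the sketch below; the decorator name and the
# functools.wraps call are illustrative, not taken from this module.
import functools

def features_availability_required(func):

    @functools.wraps(func)
    def wrapped_view(containerid: int = None, words: int = 10,
                     features: int = 10, docsperfeat: int = 5,
                     featsperdoc: int = 3, **kwds):
        ...  # body as defined above

    return wrapped_view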
def process_crawl_resp(resp, containerid, crawlid):
    """
    Processes the response of the crawler, received from crawl_metrics.

    If the crawl is ready, the crawl status is deleted, the container is
    marked as crawl-ready and an integrity check is triggered (unless one is
    already in progress). Otherwise monitor_crawl is re-queued so the check
    repeats after a countdown.

    :param resp: the response returned by crawl_metrics
    :param containerid: id of the container
    :param crawlid: id of the crawl
    :return:
    """
    crawl_status = Container.container_status(containerid)
    if resp.get('ready'):
        celery.send_task(SCRASYNC_TASKS['delete_crawl_status'], kwargs={
            'containerid': containerid,
            'crawlid': crawlid
        })
        container = Container.get_object(pk=containerid)
        container.set_crawl_ready(value=True)
        if not crawl_status['integrity_check_in_progress']:
            celery.send_task(RMXWEB_TASKS['integrity_check'],
                             kwargs={'containerid': containerid})
    else:
        celery.send_task(RMXWEB_TASKS['monitor_crawl'], args=[containerid],
                         countdown=CRAWL_MONITOR_COUNTDOWN)
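
# Crawl lifecycle, as assumed from the tasks above: crawl_async launches the
# crawl in scrasync and schedules monitor_crawl; monitor_crawl requests
# crawl_metrics and links the result to process_crawl_resp; while the crawl is
# not ready, process_crawl_resp re-queues monitor_crawl with a countdown, and
# once it is ready the crawl status is cleaned up and integrity_check runs.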
def search_texts(words: typing.List[str] = None, highlight: bool = None,
                 path: str = None) -> dict:
    """
    Searches a collection of texts for a list of words.

    :param words: the words to search for
    :param highlight: whether matches should be highlighted
    :param path: path to the container's text folder
    :return: the search results returned by rmxgrep
    """
    return celery.send_task(RMXGREP_TASK['search_text'], kwargs={
        'highlight': highlight,
        'words': words,
        'container_path': path,
    }).get()
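
# Usage sketch (illustrative): search a container's text folder for two
# placeholder terms with highlighting enabled.
def example_search(container):
    return search_texts(words=['first term', 'second term'],
                        highlight=True,
                        path=container.get_folder_path())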
def hierarchical_tree(containerid=None, flat: bool = None, container=None,
                      **_) -> dict:
    """
    Retrieves the hierarchical tree for a container from nlp.

    :param containerid: id of the container
    :param flat: whether to return a flat structure
    :param container: the Container instance (unused here)
    :return: the tree returned by nlp
    """
    return celery.send_task(NLP_TASKS['hierarchical_tree'], kwargs={
        'containerid': containerid,
        'flat': flat,
    }).get(timeout=3)
def get_features(feats: int = 10, words: int = 6, containerid: int = None,
                 path: str = None, docs_per_feat: int = 0,
                 feats_per_doc: int = 3):
    """
    Gets the features from nlp. This will call a view method that will
    retrieve or generate the requested data.
    """
    return celery.send_task(NLP_TASKS['features_and_docs'], kwargs={
        'path': path,
        'feats': feats,
        'containerid': containerid,
        'words': words,
        'docs_per_feat': docs_per_feat,
        'feats_per_doc': feats_per_doc
    }).get()