Example #1
def delete_repo(to_delete):
    '''
    Deletes the repository (GitHub issues and git commits) from the dashboard
    by deleting the Elasticsearch data.
    :param to_delete: the url of the repository to be deleted
    '''
    print("Repository", to_delete, "will be deleted")
    s = Search(using=es, index="git_test", doc_type="items").\
        query("match", origin=to_delete + ".git")
    response = s.delete()

    s = Search(using=es, index="git_test-raw", doc_type="items").\
        query("match", origin=to_delete + ".git")
    response = s.delete()

    s = Search(using=es, index="github_test", doc_type="items").\
        query("match", origin=to_delete)
    response = s.delete()

    s = Search(using=es, index="github_test-raw", doc_type="items").\
        query("match", origin=to_delete)
    response = s.delete()
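
# A more compact sketch of the same cleanup, assuming the same `es` client and
# index names as above (doc_type omitted here; it is deprecated in newer
# Elasticsearch versions):
def delete_repo_compact(to_delete):
    for index, origin in [
        ("git_test", to_delete + ".git"),
        ("git_test-raw", to_delete + ".git"),
        ("github_test", to_delete),
        ("github_test-raw", to_delete),
    ]:
        Search(using=es, index=index).query("match", origin=origin).delete()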
Example #2
    def delete_by_repo(self, repo_id):
        """Delete all the docs of a repo.

        SQL: delete from repofiles where repo='xxx'
        """
        s = Search(using=self.es, index=self.INDEX_NAME).query('term',
                                                               repo=repo_id)
        s.delete()
Example #3
 def _delete_distribution_data(self, distribution):
     fields_to_delete = list(
         distribution.field_set.filter(present=True).exclude(
             identifier=None).values_list('identifier', flat=True))
     series_data = Search(using=self.elastic,
                          index=self.index._name).filter(
                              'terms', series_id=fields_to_delete)
     series_data.delete()
Example #4
    def delete_by_repo_path_prefix(self, repo_id, path_prefix):
        """Delete docs of dirs and all files/sub-dirs in those dirs of a repo.

        SQL: delete from repofiles where repo='xxx' and path like '/dir_xxx/%'
        """
        s = Search(using=self.es, index=self.INDEX_NAME).query(
            'term', repo=repo_id).query('prefix', path=path_prefix)
        s.delete()
Example #5
 def _delete_distribution_data(self, distribution):
     fields_to_delete = list(
         SeriesRepository.get_present_series(distribution=distribution)
         .exclude(identifier=None)
         .values_list('identifier', flat=True)
     )
     for field in fields_to_delete:
         series_data = Search(using=self.elastic,
                              index=self.index._name).params(conflicts='proceed').filter('term', series_id=field)
         series_data.delete()
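Example #6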
def delete(properties, index='data_objects'):
    """
    Delete items matching all of the given properties from the index.
    """
    s = Search(using=client, index=index)
    clauses = []
    for k, v in properties.items():
        clauses.append('+{}:"{}"'.format(k, v))
    s = s.query("query_string", query=' '.join(clauses))
    s.delete()
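
# Hypothetical usage of the helper above; the field names and values are
# placeholders and `client` is the connection assumed by the snippet. This
# call builds the query_string: +name:"example" +status:"inactive"
delete({"name": "example", "status": "inactive"}, index="data_objects")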
Example #7
def init_tm_index(**kwargs):
    from elasticsearch_dsl import Search

    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT, ES_INDEX_TOPIC_MODELLING
    from mainapp.documents import TopicModellingIndex, DynamicTopicModellingIndex

    kwargs = kwargs.copy()
    corpus = kwargs['corpus']
    kwargs['is_multi_corpus'] = True
    if not isinstance(corpus, list):
        corpus = [corpus]
        kwargs['is_multi_corpus'] = False
    source = kwargs['source']
    datetime_from = kwargs['datetime_from']
    datetime_to = kwargs['datetime_to']
    is_dynamic = 'is_dynamic' in kwargs and kwargs['is_dynamic']

    # Check if already exists
    if 'perform_actualize' not in kwargs:
        s = Search(using=ES_CLIENT, index=kwargs['index_tm'])
        s = s.filter("term", name=kwargs['name'])
        s.delete()
        s = Search(using=ES_CLIENT, index=kwargs['index_tm'])
        s = s.filter("term", **{"name.keyword": kwargs['name']})
        try:
            s.delete()
        except Exception:
            pass
    else:
        return get_tm_index(**kwargs)

    s = Search(using=ES_CLIENT, index=ES_INDEX_DOCUMENT).filter("terms", corpus=corpus)
    if source:
        s = s.filter("term", **{"source": source})
    if datetime_from:
        s = s.filter('range', datetime={'gte': datetime_from})
    if datetime_to:
        s = s.filter('range', datetime={'lt': datetime_to})
    number_of_documents = s.count()

    kwargs["number_of_documents"] = number_of_documents
    kwargs["is_ready"] = False
    kwargs['corpus'] = "_".join(corpus)
    if is_dynamic:
        index = DynamicTopicModellingIndex(**kwargs)
    else:
        index = TopicModellingIndex(**kwargs)
    index.save()
    return index
Example #8
def remove_cropped_if_asset_exists(asset):
    try:
        search = Search(index=cfg.resolve(cfg.ELASTICSEARCH_SERVER, cfg.index_prefix) +
                              cfg.resolve(cfg.ELASTICSEARCH_SERVER, cfg.index_asset_meta))
        search.query = Q('match', asset_id=asset.asset_id)
        for hit in search:
            idx = '{}-{}'.format(asset.asset_id, hit.cropped_id)
            s = Search(index=cfg.resolve(cfg.ELASTICSEARCH_SERVER, cfg.index_prefix) +
                             cfg.resolve(cfg.ELASTICSEARCH_SERVER, cfg.index_cropped))
            s.query = Q('match', id=idx)
            s.delete()
        search.delete()
    except Exception:
        print(sys.exc_info()[0])
Example #9
def test3_delete():
    '''
    Delete the matching docs, then show which records remain.
    :return:
    '''
    s = Search(using=client, index='test-index').query('match', sport='gaming')
    get_dsl(s)
    # print('Matching records:')
    # response = s.execute()
    # for hit in response:
    #     get_readable_rs(hit.to_dict())
    # get_dsl(s)
    print('Deleting records:')
    s.delete()
    print('Records after deletion:')
    response = s.execute()
    for hit in response:
        get_readable_rs(hit.to_dict())
Example #10
    def handle(self, *args, **options):
        self.get_tm_ids()

        IpcAppList.objects.filter(id__in=self.ids).delete()

        print(len(self.ids))
        print(self.ids)
        print(self.app_nums)

        es = Elasticsearch(settings.ELASTIC_HOST, timeout=settings.ELASTIC_TIMEOUT)
        for _id in self.ids:
            q = Q(
                'bool',
                must=[Q('match', _id=_id)],
            )
            s = Search(index=settings.ELASTIC_INDEX_NAME).using(es).query(q)
            s.delete()

        self.stdout.write(self.style.SUCCESS('Finished'))
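Example #11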
def delete_rows(client, index, date, path_log):
    try:
        s = Search(using=client, index=index) \
            .filter('range', **{'Date': {'gte': date, 'lte': date}})
        response = s.delete()
    except Exception:
        error = sys.exc_info()
        simple_log(path_log, index, error[0], error[1])

    return None
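Example #12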
    def delete_all(self, **kwargs):
        output = {'success': False}
        index = kwargs.get('index')
        if index is None:
            msg = "Error: 'index' argument is required"
            output['msg'] = msg
            return output
        if not index_exists(index):
            msg = "Error: index {} does not exist".format(index)
            output['msg'] = msg
            return output

        cherrypy.log('delete_all in index {}'.format(index))
        s = Search(index=index)
        s.delete()
        s = Search(index=token_index_name)
        s.delete()
        output['success'] = True
        return output
Example #13
    def clear_topics_from_topic_index(self, topic):
        should = []
        if isinstance(topic, list):
            should.extend([Q('match', topic_id=t) for t in topic])
        else:
            should.append(Q('match', topic_id=topic))

        q = Q('bool', should=should)
        s = Search(using=self.client, index=self.topic_index) \
            .query(q)

        response = s.delete()
        return response.success()
Example #14
    def test_delete(self):
        person = Person(id="1",
                        name="唐僧",
                        age=66,
                        create_time=datetime.now(),
                        desc="desc",
                        meta={'id': 42})
        self.assertEqual(42, person.meta.id)
        person.save(using=es)

        # Insert a few test documents
        Person(id="2", name="张三", age=15,
               create_time="2013-09-10 23:40:00").save(using=es)
        Person(id="3", name="李四", age=16,
               create_time="2013-10-10 23:40:00").save(using=es)
        Person(id="4", name="王五", age=17,
               create_time="2013-11-10 23:40:00").save(using=es)
        Person._index.refresh(using=es)

        # Delete a single document
        p = Person.get(id=42, using=es, ignore=[400, 404])
        p.delete(using=es)
        self.assertIsNone(Person.get(
            id=42, using=es,
            ignore=[400, 404]))  # pass ignore, otherwise an exception is raised when the document does not exist

        # Bulk delete, method 1
        s = Search(using=es, index=Person._index._name).filter('term',
                                                               name="张三")
        res = s.delete()
        self.assertEqual(1, res.deleted)

        # Bulk delete, method 2
        s = Search(using=es, index=Person._index._name).filter('term',
                                                               name="李四")
        res = [
            hit
            for hit in scan(es, query=s.to_dict(), index=Person._index._name)
        ]
        for r in res:
            r['_op_type'] = 'delete'
        bulk(es, res, params={"refresh": 'true'})

        # Verify the result
        total = Search(using=es, index=Person._index._name).count()
        self.assertEqual(1, total)
Example #15
def delete_repository(arr):
    es = Elasticsearch('http://localhost:9200', verify_certs=False)

    s = Search(using=es, index='git_test-raw')
    s.aggs.bucket('repository', 'terms', field='origin')
    result = s.execute()
    buckets_result = result['aggregations']['repository']['buckets']

    for i, repo in enumerate(buckets_result):
        print(i, repo.key)
    index = int(
        input("\nEnter the Index of repository which you want to delete\n"))
    delete_repo = buckets_result[index]['key']

    for i in arr:
        print(i)
        s = Search(using=es, index=i).query("match", origin=delete_repo)
        response = s.delete()
Example #16
    def _delete_all(self, criteria: Q = None):
        """Delete all records matching criteria from the Repository"""
        conn = self._get_session()

        # Build the filters from the criteria
        q = elasticsearch_dsl.Q()
        if criteria and criteria.children:
            q = self._build_filters(criteria)

        s = Search(using=conn, index=self.model_cls._index._name).query(q)

        # Return the results
        try:
            response = s.delete()

            # `Search.delete` does not refresh index, so we have to manually refresh
            index = Index(name=self.entity_cls.meta_.schema_name, using=conn)
            index.refresh()
        except Exception as exc:
            logger.error(f"Error while deleting records: {exc}")
            raise

        return response.deleted
Example #17
def elasticsearch_delete_old():
    _from = NEVER
    _to = datetime.now() - timedelta(days=30)
    query = Search().filter(Q("range", visited_at={'from': _from, 'to': _to}))
    result = query.delete()
Example #18
 def delete(self, index, document_id):
     s = Search(index=index).query("match", _id=document_id)
     response = s.delete()
     return response
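Example #19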
def delete_docs_by_unique_key(
    client: Elasticsearch,
    key: str,
    value_list: list,
    task_id: str,
    index,
    refresh_after: bool = True,
    delete_chunk_size: int = 1000,
) -> int:
    """
    Bulk delete a batch of documents whose field identified by ``key`` matches any value provided in the
    ``value_list``.

    NOTE: This delete routine looks at just the index name given. If there are duplicate records across
    multiple indexes, an alias or wildcard should be provided for ``index`` param that covers multiple
    indices, or this will need to be run once per index.

    Args:
        client (Elasticsearch): elasticsearch-dsl client for making calls to an ES cluster
        key (str): name of field in targeted elasticsearch index that should have a unique value for
            every doc in the index. The field or sub-field provided MUST be of ``keyword`` type (or ``_id`` meta field)
        value_list (list): if key field has these values, the document will be deleted
        task_id (str): name of ES ETL job being run, used in logging
        index (str): name of index (or alias) to target for the ``_delete_by_query`` ES operation.
        refresh_after (bool): Whether to call ``_refresh`` on the index when all of the provided values in
            ``value_list`` have been processed for delete; defaults to ``True``. If many small deletes happen at a
            rapid rate, it may be best to set this ``False`` and await a deferred refresh afterward in the calling
            code. NOTE: This param will be ignored and a refresh will be attempted if this function
            errors-out during execution, in order to not leave un-refreshed deletes in the index.
        delete_chunk_size (int): the batch-size of terms value-array given to each _delete_by_query call. Needs to be
            less than 65536 (max values for any terms query), and less than index.max_results_window setting. Ideally
            use ``config["partition_size"]`` (derived from --partition-size) to set this to a calibrated value. If not
            provided, uses 1000 as a safe default (10,000 resulted in some timeouts on a busy cluster).

    Returns: Number of ES documents deleted
    """
    start = perf_counter()

    if len(value_list) == 0:
        logger.info(
            format_log("Nothing to delete", action="Delete", name=task_id))
        return 0

    logger.info(
        format_log(f"Deleting up to {len(value_list):,} document(s)",
                   action="Delete",
                   name=task_id))
    if not index:
        raise RuntimeError("index name must be provided")

    if not _is_allowed_key_field_type(client, key, index):
        msg = (
            f'Cannot perform deletes in index "{index}" by key field "{key}" because its type is not one of '
            f"the allowed field types, or the field was not found in that index."
        )
        logger.error(format_log(msg=msg, action="Delete", name=task_id))
        raise RuntimeError(msg)

    if delete_chunk_size > 65536:
        # 65,536 is max number of terms that can be added to an ES terms filter query
        msg = (
            f"{delete_chunk_size} is greater than 65,536, which is the max number of terms that can be added to an ES "
            f"terms filter query")
        logger.error(format_log(msg=msg, action="Delete"))
        raise RuntimeError(msg)

    chunks_processed = 0
    deleted = 0
    is_error = False
    try:
        values_generator = chunks(value_list, delete_chunk_size)
        for chunk_of_values in values_generator:
            # Invoking _delete_by_query as per the elasticsearch-dsl docs:
            #   https://elasticsearch-dsl.readthedocs.io/en/latest/search_dsl.html#delete-by-query
            # _refresh is deferred until the end of chunk processing
            q = Search(using=client,
                       index=index).filter("terms",
                                           **{key:
                                              chunk_of_values})  # type: Search
            # params:
            # conflicts="proceed": Ignores version conflict errors if a doc delete is attempted more than once
            # slices="auto": Will create parallel delete batches per shard
            q = q.params(conflicts="proceed", slices="auto")
            response = q.delete()
            # Some subtle errors come back on the response
            if response["timed_out"]:
                msg = f"Delete request timed out on cluster after {int(response['took'])/1000:.2f}s"
                logger.error(format_log(msg=msg, action="Delete",
                                        name=task_id))
                raise RuntimeError(msg)
            if response["failures"]:
                fail_snippet = "\n\t\t" + "\n\t\t".join(
                    map(str, response["failures"][0:4])) + "\n\t\t" + "..."
                msg = f"Some docs failed to delete on cluster:{fail_snippet}"
                logger.error(format_log(msg=msg, action="Delete",
                                        name=task_id))
                raise RuntimeError(msg)
            logger.info(
                format_log(
                    f"Deleted {response['deleted']:,} docs in ES from chunk of size {len(chunk_of_values):,} "
                    f"in {int(response['took'])/1000:.2f}s, "
                    f"and ignored {response['version_conflicts']:,} version conflicts",
                    action="Delete",
                    name=task_id,
                ))
            deleted += response["deleted"]
            chunks_processed += 1
    except Exception:
        is_error = True
        logger.exception(format_log("", name=task_id, action="Delete"))
        raise
    finally:
        if deleted > 0 and (refresh_after or is_error):
            if not is_error:
                refresh_msg = "Refreshing index so deletes take effect"
            else:
                refresh_msg = "Attempting index refresh while handling error so deletes take effect"
            logger.info(format_log(refresh_msg, action="Delete", name=task_id))
            client.indices.refresh(index=index)
        if chunks_processed > 1 or is_error:
            # This log becomes redundant unless to log the sum of multiple chunks' deletes (or error)
            error_text = " before encountering an error" if is_error else ""
            duration = perf_counter() - start
            docs = f"document{'s' if deleted != 1 else ''}"
            msg = f"Delete operation took {duration:.2f}s. Removed {deleted:,} total {docs}{error_text}"
            logger.info(format_log(msg, action="Delete", name=task_id))

    return deleted
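
# A hypothetical call to the helper above; the cluster URL, index alias, task
# name, and id values are placeholders rather than values from a real ETL job:
client = Elasticsearch("http://localhost:9200")
deleted_count = delete_docs_by_unique_key(
    client,
    key="_id",
    value_list=["doc-id-1", "doc-id-2"],
    task_id="example_delete_task",
    index="my-index-alias",
    delete_chunk_size=1000,
)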
Example #20
def delete_by_id(id):
    client = connect_es()
    s = Search(using=client, index=ELASTIC_INDEX)
    s.query = Q('term', _id=id)
    return s.delete()
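Example #21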
def make_apartments_sold_in_elastic() -> None:
    s_obj = Search(index="test-apartment").query(
        "match", apartment_state_of_sale=ApartmentStateOfSale.FOR_SALE
    )
    s_obj.delete()
    sleep(3)
Example #22
 def remove(cls, name):
     q = name.split(' ')
     pers = Search(index='softwareprofs').query("simple_query_string",
                                                query=' +'.join(q),
                                                fields=["name"])
     pers.delete()
Example #23
def delete_document(book_id):
    s = Search(index='book-index').query('match', index__id=book_id)
    s.delete()
Example #24
def deletePost(url):
    s = Search(index='blog').query('match', _id=url)
    response = s.delete()
    return response
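Example #25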
# The API is chainable
s = Search().using(client).query('match', title='python')
# Send the request
response = s.execute()

# Responses are cached by default on the Search object, so
# subsequent calls to execute will not trigger additional
# requests being sent to Elasticsearch.
# To force a new request, pass `ignore_cache=True` when calling
# execute()
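# For instance, to bypass the cached response on `s` (a minimal sketch):
response = s.execute(ignore_cache=True)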


# Much like the Django orm we are familiar with, we can delete
# the documents matching a search by calling delete
s = Search().query('match', title='python')
response = s.delete()
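# delete() runs a _delete_by_query under the hood; as in some of the examples
# above, parameters such as conflicts handling can be set via params() before
# deleting (a minimal sketch):
s = Search().query('match', title='python').params(conflicts='proceed')
response = s.delete()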


#####################################
# QUERIES

# Query objects are a one-to-one mapping to ES query DSL:
from elasticsearch_dsl.query import MultiMatch, Match

MultiMatch(query='python django', fields=['title', 'body'])
# {"mutli_match": {"query": "python django", "fields": ["title", "body"]}}

Match(title={'query': 'web framework', 'type': 'phrase'})
# {"match": {"title": {"query": "web framwork", "type": "phrase"})

Example #26
                        }
                    }
                }
            }
        }
    })
    paper_info_s = paper_info_s.source(['PaperId'])

    # Get number of query results
    results = paper_info_s[:NUM_PAPERS]
    papers = [x.PaperId for x in results.execute()]

    # Check if the paper has been seen before, and thus needs to be deleted
    checked_papers = last_papers.intersection(set(papers))
    if checked_papers:
        delete_info_s = Search(index='paper_info', using=client)
        delete_info_s = delete_info_s.query("terms",
                                            PaperId=list(checked_papers))
        delete_info_s.delete()
    last_papers = set(papers).difference(checked_papers)
    papers = list(last_papers)

    print(papers)

    # Get updated information
    process_res, partial_res = paper_info_multiquery(papers)  #, force=True)

    # Generate cached entries
    cache_paper_info(process_res)
    cache_paper_info(partial_res, chunk_size=100)
Example #27
 def _delete(_by_filter):
     search = Search(index=self._index,
                     doc_type=self._mapping_type,
                     using=self._es_client)
     search = search.query(_by_filter)
     return search.delete()
Example #28
    partial_papers = [p for (p, t) in paper_ids if t == 'partial']

    print('[{}] -- Generate cache entries'.format(datetime.now()))
    complete_res, partial_res = paper_info_multiquery(
        complete_papers, query_filter=cache_allow,
        partial_updates=partial_papers, recache=True)

    print('[{}] -- Add to cache'.format(datetime.now()))
    cache_paper_info(
        complete_res, additional_tag={'UpdateVersion': START_VERSION})
    cache_paper_info(
        partial_res, additional_tag={'UpdateVersion': START_VERSION})

    print('[{}] -- Remove old paper ids'.format(datetime.now()))
    res_ids = [p['PaperId'] for p in complete_res + partial_res]
    old_ids = [p for p in next(zip(*paper_ids)) if p not in res_ids]
    if len(old_ids) > 0:
        remove_s = Search(index='paper_info', using=client)
        remove_s = remove_s.query('terms', PaperId=old_ids)
        remove_s.delete()

    print('[{}] - Finish batch {}\n'.format(datetime.now(), counter))
    counter += 1
    complete_updated += len(complete_res)
    partial_updated += len(partial_res)
    removed += len(old_ids)

    print('\n[{}] - Complete: {}, Partial: {}, Total: {}, Remove: {}\n'.format(
        datetime.now(), complete_updated, partial_updated, complete_updated +
        partial_updated, removed))