Example #1
def insert_document_to_index(documents, text_an, index, keep):
    client = Elasticsearch()

    idx = Index(index, using=client)

    if idx.exists() and not keep:
        print('Removing existing index...')
        idx.delete()

    if not idx.exists():
        print('Creating index')
        idx.create()

    # Analysis settings can only be changed while the index is closed;
    # the analyzer registered here is pushed to Elasticsearch by save() below.
    idx.close()
    idx.analyzer(text_an)

    client.indices.put_mapping(
        doc_type='document',
        index=index,
        body={'document': {
            'properties': {
                'path': {
                    'type': 'keyword'
                }
            }
        }})

    idx.save()
    idx.open()

    print("Index settings=", idx.get_settings())
    print('Indexing ...')
    bulk(client, documents)
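
The example above registers the analyzer and mapping imperatively (close the index, register the analyzer, put the mapping, save, reopen). A minimal alternative sketch using the declarative side of elasticsearch_dsl, assuming a recent release (7.x); the document class, field names and analyzer below are illustrative, not part of the original example:

from elasticsearch_dsl import Document, Index, Keyword, Text, analyzer, connections

# Hypothetical analyzer and document class, for illustration only.
text_analyzer = analyzer('text_analyzer',
                         tokenizer='standard',
                         filter=['lowercase', 'asciifolding'])

class MyDoc(Document):
    path = Keyword()
    text = Text(analyzer=text_analyzer)

connections.create_connection(hosts=['localhost'])

idx = Index('my-index')
idx.document(MyDoc)     # attach the mapping (and the analyzer it references)
idx.delete(ignore=404)  # start from a clean slate
idx.create()            # settings, analysis and mapping are created in one call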
Example #2
def test_complete(sch, database, elasticsearch):
    """A test of the complete functioning of the pipeline."""
    pipeline = sch.pipelines.get(name='Tour de France Data to Elasticsearch')

    SAMPLE_DATA = [dict(year=1903, rank=1, name='MAURICE GARIN', number=1, team='TDF 1903',
                        time='94h 33m 14s', hours=94, mins=33, secs=14),
                   dict(year=1903, rank=2, name='LUCIEN POTHIER', number=37, team='TDF 1903',
                        time='97h 32m 35s', hours=97, mins=32, secs=35),
                   dict(year=1903, rank=3, name='FERNAND AUGEREAU', number=39, team='TDF 1903',
                        time='99h 02m 38s', hours=99, mins=2, secs=38)]
    EXPECTED_RECORDS = [dict(year=1903, rank=1, firstName='Maurice', lastName='Garin', number=1, team='TDF 1903',
                             time='94h 33m 14s', hours=94, mins=33, secs=14),
                        dict(year=1903, rank=2, firstName='Lucien', lastName='Pothier', number=37, team='TDF 1903',
                             time='97h 32m 35s', hours=97, mins=32, secs=35),
                        dict(year=1903, rank=3, firstName='Fernand', lastName='Augereau', number=39, team='TDF 1903',
                             time='99h 02m 38s', hours=99, mins=2, secs=38)]

    table_name = get_random_string()
    index = get_random_string(string.ascii_lowercase)

    table = sqlalchemy.Table(table_name,
                             sqlalchemy.MetaData(),
                             sqlalchemy.Column('id', sqlalchemy.Integer, primary_key=True),
                             sqlalchemy.Column('year', sqlalchemy.Integer),
                             sqlalchemy.Column('rank', sqlalchemy.Integer),
                             sqlalchemy.Column('name', sqlalchemy.String(100)),
                             sqlalchemy.Column('number', sqlalchemy.Integer),
                             sqlalchemy.Column('team', sqlalchemy.String(100)),
                             sqlalchemy.Column('time', sqlalchemy.String(100)),
                             sqlalchemy.Column('hours', sqlalchemy.Integer),
                             sqlalchemy.Column('mins', sqlalchemy.Integer),
                             sqlalchemy.Column('secs', sqlalchemy.Integer))
    try:
        logger.info('Creating table (%s) in database ...', table_name)
        table.create(database.engine)

        logger.info('Inserting sample data ...')
        connection = database.engine.connect()
        connection.execute(table.insert(), SAMPLE_DATA)

        runtime_parameters = dict(JDBC_CONNECTION_STRING=database.jdbc_connection_string,
                                  JDBC_USERNAME=database.username,
                                  JDBC_PASSWORD=database.password,
                                  ELASTICSEARCH_URI=f'{elasticsearch.hostname}:{elasticsearch.port}',
                                  ELASTICSEARCH_CREDENTIALS=f'{elasticsearch.username}:{elasticsearch.password}',
                                  ELASTICSEARCH_INDEX=index,
                                  TABLE_NAME_PATTERN=f'%{table_name}%')

        with sch.run_test_job(pipeline, runtime_parameters, data_collector_labels=sch.data_collector_labels) as job:
            time.sleep(10)
            data_in_elasticsearch = [hit.to_dict() for hit in elasticsearch.search(index=index).sort('rank').execute()]
            assert EXPECTED_RECORDS == data_in_elasticsearch
    finally:
        index = Index(index, using=elasticsearch.client)
        if index.exists():
            logger.info('Deleting Elasticsearch index %s ...', index)
            index.delete()

        logger.info('Dropping table %s ...', table_name)
        table.drop(database.engine)
Example #3
def _init_index(index_config, force):
    index = Index(index_config['name'])
    aliases = {}
    for alias_val in index_config['alias']:
        if isinstance(alias_val, basestring):
            aliases[alias_val] = {}
        else:
            aliases[alias_val['name']] = alias_val['config']
    index.aliases(**aliases)
    if force:
        index.delete(ignore=404)
    try:
        index.create()
    except TransportError as err:
        # An index that already exists comes back as HTTP 400
        # (resource_already_exists_exception), not 404.
        if err.status_code == 400:
            logger.debug('Index already exists, initializing document')
        else:
            raise
    index.close()

    for document_config in index_config['documents']:
        module_str, class_str = document_config['class'].rsplit('.', 1)
        module = import_module(module_str)
        cls = getattr(module, class_str)
        index.doc_type(cls)
        cls.init()
    index.open()

    return index
Example #4
def setup_db():
    test_domain = initialize_domain()

    # Create all indexes
    from .elements import Person, Alien, User, ComplexUser, Provider
    test_domain.register(Person)
    test_domain.register(Alien)
    test_domain.register(User)
    test_domain.register(ComplexUser)
    test_domain.register(Provider)

    provider = test_domain.get_provider('default')
    conn = provider.get_connection()

    for _, aggregate_record in test_domain.aggregates.items():
        index = Index(aggregate_record.cls.meta_.schema_name, using=conn)
        if not index.exists():
            index.create()

    yield

    # Drop all indexes at the end of test suite
    for _, aggregate_record in test_domain.aggregates.items():
        index = Index(aggregate_record.cls.meta_.schema_name, using=conn)
        if index.exists():
            index.delete()
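
Because setup_db yields between index creation and cleanup, it already has the shape pytest expects from a fixture. A hypothetical conftest.py wiring it up; the fixture name, scope and import path are assumptions, not part of the original:

# Hypothetical conftest.py; the module defining setup_db is assumed importable.
import pytest

from .elasticsearch_setup import setup_db  # illustrative import path

@pytest.fixture(autouse=True, scope="module")
def elasticsearch_indexes():
    yield from setup_db()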
Example #5
def user_index(app):
    """Initialize the `User` doc type."""
    test_index = Index(uuid4().hex)
    test_index.create()
    app.cluster.health(wait_for_status='yellow')

    # monkey patch `auth_index`
    original_auth_index = auth_models.auth_index
    auth_models.auth_index = test_index

    User.init(index=test_index._name)

    yield test_index

    auth_models.auth_index = original_auth_index
    # Remove all `User`s.
    #
    # [Don't use delete-by-query to clean out all or most documents in an
    # index. Rather create a new index...]
    # (https://www.elastic.co/guide/en/elasticsearch/plugins/2.2/plugins-delete-by-query.html)
    #
    # [It is no longer possible to delete the mapping for a type. Instead you
    # should delete the index and recreate it with the new mappings.]
    # (https://www.elastic.co/guide/en/elasticsearch/reference/2.2/indices-delete-mapping.html)
    test_index.delete()
Example #6
 def _create_index(cls, index_name):
     new_index = Index(index_name, using=CONNECTION_ALIAS)
     new_index.delete(ignore=[400, 404])
     new_index.settings(index=DEFAULT_INDEX_SETTING)
     new_index.create()
     cls.init(index=index_name)
     return new_index
Example #7
def create_index():
    index = Index(settings.INDEX)
    index.delete(ignore=404)
    for t in [Action, Contact, Run]:
        index.doc_type(t)
    index.create()
    load_flows()
Example #8
def delete_failed_index(mdx, dic_pk):
    try:
        print('try delete failed index', get_index_name_with_pk(dic_pk))
        index = Index(get_index_name_with_pk(dic_pk))
        index.delete()
    except Exception as e:
        write_exception_error(mdx, dic_pk, e)
Example #9
    def rebuild_platforms(self):
        galaxy_platforms = Index('galaxy_platforms')

        galaxy_platforms.doc_type(PlatformDoc)
        galaxy_platforms.delete(ignore=404)
        galaxy_platforms.create()

        for platform in Platform.objects.filter(
                active=True).distinct('name').all():
            alias_list = [
                alias
                for alias in self.get_platform_search_terms(platform.name)
            ]
            alias_list = '' if len(alias_list) == 0 else alias_list
            release_list = [
                p.release for p in Platform.objects.filter(
                    active=True, name=platform.name).order_by(
                        'release').distinct('release').all()
            ]
            search_name = 'Enterprise_Linux' if platform.name == 'EL' else platform.name
            doc = PlatformDoc(
                name=search_name,
                releases=release_list,
                roles=Role.objects.filter(
                    active=True, is_valid=True,
                    platforms__name=platform.name).order_by(
                        'namespace', 'name').distinct('namespace',
                                                      'name').count(),
                alias=alias_list,
                autocomplete="%s %s %s" %
                (search_name, ' '.join(release_list), ' '.join(alias_list)))
            doc.save()
Example #10
def buildIndex():
    song_index = Index('song_index')
    if song_index.exists():
        song_index.delete()  # Overwrite any previous version
    song_index.doc_type(Song)  # Set doc_type to Song
    song_index.create()

    # Open the JSON song corpus
    with open('./data/data_v3.json') as data_file:
        songs = json.load(data_file)
        size = len(songs)

    # Action series for bulk loading
    actions = [{
        "_index": "song_index",
        "_type": "song",
        "_id": mid,
        "title": songs[str(mid)]['title'],
        "description": songs[str(mid)]['description'],
        "type": songs[str(mid)]['type'],
        "song_name": songs[str(mid)]['song_name'],
        "album_name": (songs[str(mid)]['album_name']),
        "artist": (songs[str(mid)]['artist']),
        "rank": (songs[str(mid)]['rank']),
        "charts": songs[str(mid)]['charts'],
        "image_link": songs[str(mid)]['image_link'],
        "category": list2str(songs[str(mid)]['category']),
    } for mid in range(1, size + 1)]

    helpers.bulk(es, actions)
Example #11
def es_delete_cmd(index_name):
    """Delete a specified index

    :arg index_name: name of index to delete

    """
    indexes = [name for name, count in get_indexes()]

    if index_name not in indexes:
        log.error('Index "%s" is not a valid index.', index_name)
        if not indexes:
            log.error('There are no valid indexes.')
        else:
            log.error('Valid indexes: %s', ', '.join(indexes))
        return

    ret = raw_input('Are you sure you want to delete "%s"? (yes/no) ' %
                    index_name)
    if ret != 'yes':
        return

    log.info('Deleting index "%s"...', index_name)
    index = Index(name=index_name, using='default')
    try:
        index.delete()
    except NotFoundError:
        pass
    log.info('Done!')
Example #12
def create_delete_index(**kwargs):
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT_EVAL
    from mainapp.documents import DocumentEval
    from util.util import shards_mapping
    from elasticsearch_dsl import Index

    crit_or_class_ids = kwargs['crit_or_class_ids']
    is_criterion = kwargs['is_criterion']
    perform_actualize = kwargs['perform_actualize']
    topic_modelling_name = kwargs['topic_modelling_name']
    scored_documents = kwargs['scored_documents']

    for crit_id in crit_or_class_ids:
        index_name = f"{ES_INDEX_DOCUMENT_EVAL}_{topic_modelling_name}_{crit_id}{'_m4a' if is_criterion else '_m4a_class'}"
        if not perform_actualize:
            es_index = Index(index_name, using=ES_CLIENT)
            es_index.delete(ignore=404)
        if not ES_CLIENT.indices.exists(index_name):
            settings = DocumentEval.Index.settings
            settings['number_of_shards'] = shards_mapping(scored_documents.shape[0])
            ES_CLIENT.indices.create(
                index=index_name,
                body={
                    "settings": settings,
                    "mappings": DocumentEval.Index.mappings
                })
Example #13
def buildIndex():
    film_index = Index('sample_film_index')
    if film_index.exists():
        film_index.delete()  # Overwrite any previous version
    film_index.doc_type(Movie) # Set doc_type to Movie
    film_index.create()
    
    # Open the json film corpus
    with open('films_corpus.json') as data_file:
        movies = json.load(data_file)
        size = len(movies)
    
    # Action series for bulk loading
    actions = [
        {
            "_index": "sample_film_index",
            "_type": "movie",
            "_id": mid,
            "title":movies[str(mid)]['title'],
            "text":movies[str(mid)]['text'],
            "starring":movies[str(mid)]['starring'],
            "runtime": get_runtime(movies[str(mid)]['runtime']), #movies[str(mid)]['runtime'] # You would like to convert runtime to integer (in minutes)
            # --- Add more fields here ---
            "language": movies[str(mid)]['language'],
            "country": movies[str(mid)]['country'],
            "director": movies[str(mid)]['director'],
            "location": movies[str(mid)]['location'],
            "time": movies[str(mid)]['time'],
            "categories": movies[str(mid)]['categories']
        }
        for mid in range(1, size+1)
    ]
    
    helpers.bulk(es, actions)
Example #14
def main(es_index_name, file_to_index):
    """
    Build an elasticsearch index over the Docterms data using bulk load
    """
    start_time = time.time()
    #buildIndex()

    index = Index(es_index_name)
    print "[main]After index(es_index_name)"

    if index.exists():
        index.delete()  # Overwrite any previous version

    index.doc_type(DocTerms)
    print("[main]After index.doc_type.")

    #index.doc_type("doc_terms")
    index.create()

    print "[main]Created index."
    #exit()

    #docterms_create_index(es, es_index_name)
    stream = docterms_document_stream(file_to_index, es_index_name)
    print "[main]Calling bulk loader."
    helpers.bulk(es, stream)

    #for result in docterms_document_stream(file_to_index):
    #    print "result: %s" % (result)

    es.indices.refresh(index=es_index_name)
    print("[main]Built index in %s seconds ===" % (time.time() - start_time))
Example #15
def addIndex(gI):
    f = "output" + str(gI)
    ftxt = codecs.open(f, "r", encoding='iso-8859-1')
    
    text = ''
    for line in ftxt:
        text += line
    # Insert operation for a document with fields' path' and 'text'
    ldocs = []
    ldocs.append({'_op_type': 'index', '_index': f, '_type': 'document', 'path': f, 'text': text})

    # Working with ElasticSearch
    client = Elasticsearch()
    try:
        # Drop index if it exists
        ind = Index(f, using=client)
        ind.delete()
    except NotFoundError:
        pass
    # then create it
    ind.settings(number_of_shards=1)
    ind.create()

    # Bulk execution of elasticsearch operations (faster than executing all one by one)
    print('Indexing ...')
    bulk(client, ldocs)
Example #16
 def handle(self, *args, **kwargs):
     es = connections.get_connection()
     for name, model in MODELS:
         print('Processing %s:' % name)
         document = model.es_doc
         index_name = document._doc_type.index
         index = Index(index_name)
         index.settings(**settings.ES_INDEXES_SETTINGS)
         index.doc_type(document)
         print(' - Deleting index.')
         index.delete(ignore=404)
         print(' - Creating index.')
         index.create()
         total = model.objects.count()
         if total == 0:
             print(' - No %s to index.' % name)
             continue
         progress_bar = tqdm(
             total=total,
             bar_format=
             ' - Indexing: {n_fmt}/{total_fmt} [{elapsed} < {remaining}]',
             ncols=1,  # required to show the custom bar_format
         )
         for _ in streaming_bulk(
                 es,
                 (obj.get_es_data() for obj in model.objects.all().iterator()),
                 index=index_name,
                 doc_type=document._doc_type.name,
         ):
             progress_bar.update(1)
         progress_bar.close()
Example #17
class BaseSearchTestCase(TestCase):

    def setUp(self):
        from django.conf import settings
        SEARCH = getattr(settings, 'SEARCH')

        connections.create_connection('testing', **SEARCH['default']['connections'])
        self.index = Index(SEARCH['default']['index'], using='testing')
        # This is needed for test_documents, but has side effects in all running tests
        doctypes_list = (
            value for name, value
            in inspect.getmembers(documents)
            if not name.startswith('_') and
            inspect.isclass(value) and
            issubclass(value, DocType) and
            name != DocType.__name__
        )

        for doctype in doctypes_list:
            # Remove assigned index
            doctype._doc_type.index = None
            # Associate docs with test index
            self.index.doc_type(doctype)

        if self.index.exists():
            self.index.delete(ignore=404)
        self.index.create()

        self.search = Search(index=SEARCH['default']['index'])

    def tearDown(self):
        self.index.delete()
        queue = django_rq.get_queue()
        queue.empty()
Example #18
def insert_documents_to_index(documents, an, index):
    client = Elasticsearch()
    idx = Index(index, using=client)
    if idx.exists():
        idx.delete()

    idx.settings(number_of_shards=1)
    idx.create()

    idx = Index(index, using=client)
    idx.close()
    idx.analyzer(an)

    client.indices.put_mapping(
        doc_type='document',
        index=index,
        body={'document': {
            'properties': {
                'path': {
                    'type': 'keyword'
                }
            }
        }})

    idx.save()
    idx.open()

    print('Index settings=', idx.get_settings())
    print('Indexing ...')
    bulk(client, documents)
Example #19
def build_query_Index():
    query_index = Index('query_index')
    query_index.document(SearchQuery)
    if query_index.exists():
        query_index.delete()
    query_index.create()
    SearchQuery.init()
Example #20
def buildIndex():
    Disease_index = Index('test_rare_disease_index')
    if Disease_index.exists():
        Disease_index.delete()  # Overwrite any previous version
    Disease_index.doc_type(Disease)  # Set doc_type to Disease
    Disease_index.create()

    # Open the JSON disease corpus
    with open('disease_data.json') as data_file:
        diseases = json.load(data_file)
        size = len(diseases)

    # Action series for bulk loading
    actions = [
        {
            "_index": "test_rare_disease_index",
            "_type": "disease",
            "_id": mid,
            "disease_type": diseases[str(mid)]['disease_type'],
            "name": diseases[str(mid)]['name'],
            "introduction": diseases[str(mid)]['introduction'],
            "symptoms": diseases[str(mid)]['symptoms'],
            "causes": diseases[str(mid)]['causes'],
            "treatment": diseases[str(mid)]['treatment'],
            "diagnosis": diseases[str(mid)]['diagnosis'],
            "affected_populations": diseases[str(mid)]['affected_populations'],

            #diseases[str(mid)]['runtime'] # You would like to convert runtime to integer (in minutes)
            # --- Add more fields here ---
        } for mid in range(1, size + 1)
    ]

    helpers.bulk(es, actions)
Example #21
def setup_db():
    domain = initialize_domain()

    with domain.domain_context():
        # Create all indexes
        # Local/Relative Imports
        from .elements import Alien, ComplexUser, Person, User

        domain.register(Person)
        domain.register(Alien)
        domain.register(User)
        domain.register(ComplexUser)

        provider = domain.get_provider("default")
        conn = provider.get_connection()

        for _, aggregate_record in domain.registry.aggregates.items():
            index = Index(aggregate_record.cls.meta_.schema_name, using=conn)
            if not index.exists():
                index.create()

        yield

        # Drop all indexes at the end of test suite
        for _, aggregate_record in domain.registry.aggregates.items():
            index = Index(aggregate_record.cls.meta_.schema_name, using=conn)
            if index.exists():
                index.delete()
Example #22
def es_delete_cmd(index_name):
    """Delete a specified index

    :arg index_name: name of index to delete

    """
    indexes = [name for name, count in get_indexes()]

    if index_name not in indexes:
        log.error('Index "%s" is not a valid index.', index_name)
        if not indexes:
            log.error('There are no valid indexes.')
        else:
            log.error('Valid indexes: %s', ', '.join(indexes))
        return

    ret = raw_input(
        'Are you sure you want to delete "%s"? (yes/no) ' % index_name
    )
    if ret != 'yes':
        return

    log.info('Deleting index "%s"...', index_name)
    index = Index(name=index_name, using='default')
    try:
        index.delete()
    except NotFoundError:
        pass
    log.info('Done!')
Example #23
def create_index(index_name):
    """
	Create a new index in Elasticsearch. The new index will
	be identified by the value of 'index_name'.

	index_name: 
		name of index to insert documents. Valid
		arguments: projects, publications

	"""

    if index_name not in ['projects', 'publications', 'appdata']:
        raise (Exception(f"'{index_name}' is not a valid index name."))

    if index_name == 'projects':
        model = models.Project
    elif index_name == 'publications':
        model = models.Publication
    elif index_name == 'appdata':
        model = models.appdata
    else:
        raise (Exception)

    # initialize index
    idx = Index(index_name, using=client)
    # register a document with the index
    idx.document(model)
    # delete the index, ignore if it doesn't exist
    idx.delete(ignore=404)
    # create the index in elasticsearch
    idx.create()
Example #24
def build_index():
    """
    Main function of this module. Build the covid relation index.
    :return: None
    """
    covid_index = Index('covid_relation_index')

    if covid_index.exists():
        # Overwrite any previous version
        covid_index.delete()

    covid_index.document(RelationDocument)

    covid_index.create()

    metadata = {}
    with open('data/relations.csv', newline='') as csvfile:
        reader = csv.reader(csvfile)
        header = next(reader)

        # For each value in the CSV file, create a dictionary entry to store
        # its information in the appropriate place.
        for i, cols in enumerate(reader):
            metadata[str(i + 1)] = {}
            for key, col in zip(header, cols):
                # If the value type is triple, decompose the value
                # into arg subcomponents.
                if key == 'triple':
                    # Get the predicate and the arguments.
                    predicate, *args = col[2:-2].replace('\'', '').split(',')
                    metadata[str(i + 1)]['predicate'] = predicate
                    metadata[str(i + 1)]['arguments'] = args
                else:
                    metadata[str(i + 1)][key] = col

    def actions():
        for rel_id in range(1, len(metadata) + 1):
            dict_id = str(rel_id)
            try:
                yield {
                    '_index': 'covid_relation_index',
                    '_type': '_doc',
                    '_id': rel_id,
                    # The DOI allows us to link directly to the article's page where it's hosted.
                    'doi': metadata[dict_id]['doi'],
                    # The doc_id refers to the CORD-NER-corpus.json dataset. This field is unused in our web app.
                    'doc_id': metadata[dict_id]['doc_id'],
                    # Sent refers to the sentence from which the document was drawn.
                    'sent': metadata[dict_id]['sent'],
                    # Predicate refers to the predicate as explained in the RelationDocument class
                    'predicate': metadata[dict_id]['predicate'],
                    # Argument refers to the arguments as explained in the RelationDocument class
                    'arguments': metadata[dict_id]['arguments'],
                }
            except ValueError:
                continue
            except KeyError:
                continue

    helpers.bulk(es, actions())
Example #25
def test_offset_upgrade(sdc_builder, sdc_executor, elasticsearch):
    """Ensure that when upgrading from older offset format (that can be generated by either SCH or by upgrading
       pre-multithreaded pipeline) we properly upgrade the offset and the pipeline will not re-read everything
       from the source.
    """
    es_index = get_random_string(string.ascii_letters, 10).lower()
    es_doc_id = get_random_string(string.ascii_letters, 10)
    raw_str = 'Hello World!'

    builder = sdc_builder.get_pipeline_builder()
    es_origin = builder.add_stage('Elasticsearch', type='origin')
    es_origin.set_attributes(index=es_index,
                             query="{'query': {'match_all': {}}}")
    trash = builder.add_stage('Trash')

    es_origin >> trash
    pipeline = builder.build().configure_for_environment(elasticsearch)
    sdc_executor.add_pipeline(pipeline)

    # We hard code offset to be pre-migration to multi-threaded origin and thus forcing the origin to upgrade it
    offset = {
        'offsets': {
            '$com.streamsets.datacollector.pollsource.offset$': None,
        },
        'version': 2
    }
    sdc_executor.api_client.update_pipeline_committed_offsets(pipeline.id,
                                                              body=offset)

    try:
        # Put data to Elasticsearch
        elasticsearch.connect()
        doc_type = DocType(meta={'id': es_doc_id, 'index': es_index})
        doc_type.body = raw_str
        doc_type.save()  # save document to Elasticsearch
        index = Index(es_index)
        # Refresh the index so the saved document is immediately searchable
        assert index.refresh()

        # Run pipeline and assert
        snapshot = sdc_executor.capture_snapshot(pipeline,
                                                 start_pipeline=True).snapshot
        # no need to stop pipeline - as ES origin shuts off once data is read from Elasticsearch
        snapshot_data = snapshot[es_origin.instance_name].output[0].field
        # assert ES meta
        assert snapshot_data['_index'] == es_index and snapshot_data[
            '_id'] == es_doc_id
        # assert ES data
        assert snapshot_data['_source']['body'] == raw_str

        # Now let's validate that the offset doesn't have the poll key any more
        offset = sdc_executor.api_client.get_pipeline_committed_offsets(
            pipeline.id).response.json()
        assert offset is not None
        assert '$com.streamsets.datacollector.pollsource.offset$' not in offset[
            'offsets']
    finally:
        # Clean up test data in ES
        idx = Index(es_index)
        idx.delete()
Example #26
def blog_index():
    tmp_index = Index(uuid4().hex)
    tmp_index.create()

    yield tmp_index._name

    tmp_index.delete()
Example #27
    def handle(self, *args, **options):
        companies = Index('companies')
        companies.delete(ignore=404)

        companies.doc_type(CompanyDocType)
        companies.analyzer(analyzer('english'))
        companies.create()
        management.call_command('populate_elasticsearch')
Example #28
def migrate():
    hidden_services = Index('hiddenservices')
    hidden_services.delete(ignore=404)
    hidden_services = Index('hiddenservices')
    hidden_services.doc_type(DomainDocType)
    hidden_services.doc_type(PageDocType)
    hidden_services.settings(number_of_shards=8, number_of_replicas=1)
    hidden_services.create()
Example #29
def set_up(name: str, class_name, create: bool = False):
    """Register mappings with index, optionally delete and create the index"""
    index = Index(name)
    index.document(class_name)

    if create:
        index.delete(ignore=404)
        index.create()
Example #30
 def run(self, *args, **options):
     self.confirm(
         u"Are you really sure you want to delete the index '{0}' ?".format(
             self.index_name))
     index = Index(self.index_name)
     if not self.dry_run:
         index.delete()
     self.print_success(u"Index {0} deleted.".format(self.index_name))
Example #31
def test_elasticsearch_pipeline_errors(sdc_builder, sdc_executor,
                                       elasticsearch):
    """Test for a pipeline's error records being pumped to Elasticsearch. We do so by making a Dev Raw Data source
    target to Error stage which would send records to the pipeline configured Elasticsearch error records handling.
    We then assert the error records what we find in Elasticsearch. The pipeline would look like:

    Elasticsearch error pipeline:
        dev_raw_data_source >> error_target
    """
    # Test static
    es_index = get_random_string(
        string.ascii_letters,
        10).lower()  # Elasticsearch indexes must be lower case
    es_mapping = get_random_string(string.ascii_letters, 10)
    es_doc_id = get_random_string(string.ascii_letters, 10)
    raw_str = 'Hello World!'

    # Build pipeline
    builder = sdc_builder.get_pipeline_builder()
    errstg = builder.add_error_stage('Write to Elasticsearch')
    errstg.set_attributes(document_id=es_doc_id,
                          index=es_index,
                          mapping=es_mapping)
    dev_raw_data_source = builder.add_stage(
        'Dev Raw Data Source').set_attributes(data_format='TEXT',
                                              stop_after_first_batch=True,
                                              raw_data=raw_str)
    error_target = builder.add_stage('To Error')

    dev_raw_data_source >> error_target
    es_error_pipeline = builder.build(
        title='ES error pipeline').configure_for_environment(elasticsearch)
    sdc_executor.add_pipeline(es_error_pipeline)

    try:
        elasticsearch.connect()

        # Make sure that the index exists properly before running the test
        index = Index(es_index)
        index.create()
        assert index.refresh()

        # Run pipeline and read from Elasticsearch to assert
        sdc_executor.start_pipeline(es_error_pipeline).wait_for_finished()

        # Since we are upsert on the same index, map, doc - there should only be one document (index 0)
        es_search = ESSearch(index=es_index)
        es_response = _es_search_with_retry(es_search)
        es_meta = es_response[0].meta
        # assert meta ingest
        assert es_meta['index'] == es_index and es_meta[
            'doc_type'] == es_mapping and es_meta['id'] == es_doc_id
        # assert data ingest
        assert raw_str == es_response[0].text
    finally:
        # Clean up test data in ES
        idx = Index(es_index)
        idx.delete()
Example #32
def test_delete(write_client):
    write_client.indices.create(
        index='test-index',
        body={'settings': {'number_of_replicas': 0, 'number_of_shards': 1}}
    )

    i = Index('test-index', using=write_client)
    i.delete()
    assert not write_client.indices.exists(index='test-index')
Example #33
def drop_index(silent=True):
    """Remove the ElasticSearch index.
    """
    index = Index(elasticsearch_config['index'])
    try:
        index.delete()
    except Exception as exc:
        if not silent:
            raise exc
Example #34
 def run(self, *args, **options):
     self.confirm(
         u"Are you really sure you want to delete the index '{0}' ?"
         .format(self.index_name)
     )
     index = Index(self.index_name)
     if not self.dry_run:
         index.delete()
     self.print_success(u"Index {0} deleted.".format(self.index_name))
Example #35
    def recreate_index(self):
        """ Delete and then re-create the index configured on this
        instance (self.index) and set the default mapping.
        """
        submission = Index(self.index)
        submission.delete(ignore=404)

        ESSubmission.init()
Example #36
    def initialize_index(self, delete_if_exists=False):
        """
        Initialize index with mapping in ElasticSearch

        :param delete_if_exists: delete index, if exists
        :return: None
        """

        def update_index_settings():
            """
            Function updates settings for slovenian lemmatization of words.
            As far as we know, elasticsearch-dsl library does not support
            custom filter settings.

            :return: None
            """
            analysis_settings = {
                "analysis": {
                    "filter": {
                        "lemmagen_filter_sl": {
                            "type": "lemmagen",
                            "lexicon": "sl"
                        }
                    },
                    "analyzer": {
                        "lemmagen_sl": {
                            "type": "custom",
                            "tokenizer": "uax_url_email",
                            "filter": [
                                "lemmagen_filter_sl",
                                "lowercase"
                            ]
                        }
                    }
                }
            }
            self.client.cluster.health(index=self.index_name,
                                       wait_for_status='green',
                                       request_timeout=2)
            self.client.indices.close(index=self.index_name)
            self.client.indices.put_settings(json.dumps(analysis_settings),
                                             index=self.index_name)
            self.client.indices.open(index=self.index_name)

        index = Index(self.index_name, using=self.client)
        if delete_if_exists and index.exists():
            index.delete()

        index.settings(
            # use higher number in production
            number_of_replicas=0
        )

        # register models
        index.doc_type(Document)
        index.create()
        update_index_settings()  # set lemmanizer
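
The docstring above notes that, at the time, custom filter settings could not be expressed through the library, hence the close/put_settings/open workaround. More recent elasticsearch_dsl releases do expose them via token_filter() and analyzer(), so the analysis chain can ride along with Index.create(); a minimal sketch under that assumption (the index name is illustrative):

from elasticsearch_dsl import Index, analyzer, token_filter

# Slovenian lemmatization filter (still requires the lemmagen plugin on the cluster).
lemmagen_filter_sl = token_filter('lemmagen_filter_sl', type='lemmagen', lexicon='sl')
lemmagen_sl = analyzer('lemmagen_sl',
                       tokenizer='uax_url_email',
                       filter=[lemmagen_filter_sl, 'lowercase'])

index = Index('my-documents')  # illustrative index name
index.analyzer(lemmagen_sl)    # becomes part of the index settings
index.create()                 # no close/put_settings/open cycle needed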
Example #37
    def test_create_index_manually(self):
        out = io.StringIO()
        index_name = 'test_manually_created_index'
        call_command('create_index', index_name, stdout=out)
        self.assertIn("Created search index '{}'".format(index_name), out.getvalue())

        index = Index(index_name)
        self.assertTrue(index.exists())

        index.delete()
        self.assertFalse(index.exists())
Example #38
def create_search_index(index_name, doc_types=None, connection='default', delete_if_exists=False):
    index = Index(index_name, using=connection)
    if delete_if_exists:
        index.delete(ignore=404)
    if doc_types:
        for dt in doc_types:
            if isinstance(dt, str):
                dt = get_document_class(dt)
            index.doc_type(dt)
    if not index.exists():
        index.create()
    return index
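
A hypothetical call of the helper above; the index name, dotted document path and connection alias are illustrative:

index = create_search_index(
    'articles',                                     # illustrative index name
    doc_types=['myapp.documents.ArticleDocument'],  # resolved via get_document_class()
    connection='default',
    delete_if_exists=True,
)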
Example #39
    def test_create_index_usings_settings(self):
        out = io.StringIO()
        call_command('create_index', stdout=out)

        self.assertIn("Creating search indices from settings", out.getvalue())
        self.assertIn("Created search index '{}'".format(self.settings['default']['index']), out.getvalue())

        index = Index(self.settings['default']['index'])
        self.assertTrue(index.exists())

        index.delete()
        self.assertFalse(index.exists())
Example #40
def create_indices(endpoint):
    """
    Creates constituent and address indices in PIC
    """
    connections.connections.create_connection(hosts=[endpoint], timeout=360, max_retries=10, retry_on_timeout=True)
    pic_index = Index('pic')
    pic_index.doc_type(Constituent)
    pic_index.doc_type(Address)
    pic_index.delete(ignore=404)

    pic_index.settings(
        number_of_shards=5,
        number_of_replicas=2
    )
    pic_index.create()
Example #41
def recreate_index():
    """Delete index if it's there and creates a new one"""
    index = Index(name=get_index_name(), using='default')

    for name, doc_type in get_doctypes().items():
        index.doc_type(doc_type)

    # Delete the index if it exists.
    try:
        index.delete()
    except NotFoundError:
        pass

    # Note: There should be no mapping-conflict race here since the
    # index doesn't exist. Live indexing should just fail.

    # Create the index with the mappings all at once.
    index.create()
Example #42
    def build_search_index(cls, reset=True):
        if reset:
            index = ES_Index(cls._es_doctype._doc_type.index)
            index.delete(ignore=404)
            cls._es_doctype.init()


        def add_to_index(id_, db, app):
            with app.app_context():
                obj = db.session.query(cls).get(id_)
                obj.add_to_search_index()

        app = db.get_app()

        with futures.ThreadPoolExecutor(max_workers=10) as executor:
            future_to_id = dict((executor.submit(add_to_index, id_, db, app),
                                 id_) for id_ in xrange(1, cls.count() + 1))

            for future in futures.as_completed(future_to_id):
                id = future_to_id[future]
                if future.exception() is not None:
                    print('%r generated an exception: %s' % (
                        id, future.exception()))
Example #43
class SearchByFieldTestCase(TestCase):

    def setUp(self):
        self.es_conn = connections.get_connection()
        self.test_crecs = []
        for i in range(20):
            self.test_crecs.append(
                CRECDoc(
                    title=str(i),
                    content='foo bar baz Foo',
                    date_issued=datetime(2017, 1, i % 5 + 1)
                )
            )
        self.index = Index(settings.ES_CW_INDEX)
        CRECDoc.init()
        for c in self.test_crecs:
            c.save(refresh=True)
        self.client = Client()

    def tearDown(self):
        self.index.delete()
    
    def test_search_by_title(self):
        c = CRECDoc(
            title='foo',
            content='blah',
            date_issued=datetime(2017, 1, 1)
        )
        c.save(refresh=True)
        start_date = datetime(2017, 1, 1)
        end_date = datetime(2017, 1, 30)
        query_args = {
            'start_date': start_date.strftime('%Y-%m-%d'),
            'end_date': end_date.strftime('%Y-%m-%d'),
            'title': 'foo',
        }
        response = self.client.get('/cwapi/search/', query_args)
        response_content = response.json()
        results = response_content['data']
        self.assertEquals(1, len(results))
        self.assertEquals('foo', results[0]['title'])
        self.assertEquals('blah', results[0]['content'])
    
    def test_search_by_content(self):
        c = CRECDoc(
            title='foo',
            content='blah',
            date_issued=datetime(2017, 1, 1)
        )
        c.save(refresh=True)
        start_date = datetime(2017, 1, 1)
        end_date = datetime(2017, 1, 30)
        query_args = {
            'start_date': start_date.strftime('%Y-%m-%d'),
            'end_date': end_date.strftime('%Y-%m-%d'),
            'content': 'blah',
        }
        response = self.client.get('/cwapi/search/', query_args)
        response_content = response.json()
        results = response_content['data']
        self.assertEquals(1, len(results))
        self.assertEquals('foo', results[0]['title'])
        self.assertEquals('blah', results[0]['content'])
    
        
    def test_date_filter(self):
        c = CRECDoc(
            title='should be in results',
            content='blah',
            date_issued=datetime(2017, 1, 1)
        )
        c2 = CRECDoc(
            title='should NOT be in results',
            content='blah',
            date_issued=datetime(2016, 1, 1)
        )
        c.save(refresh=True)
        c2.save(refresh=True)
        start_date = datetime(2017, 1, 1)
        end_date = datetime(2017, 1, 30)
        query_args = {
            'start_date': start_date.strftime('%Y-%m-%d'),
            'end_date': end_date.strftime('%Y-%m-%d'),
            'content': 'blah',
        }
        response = self.client.get('/cwapi/search/', query_args)
        response_content = response.json()
        results = response_content['data']
        self.assertEquals(1, len(results))
        self.assertEquals('should be in results', results[0]['title'])
    
            
    def test_multi_field(self):
        c = CRECDoc(
            title='foo',
            content='bar',
            date_issued=datetime(2017, 1, 1)
        )
        c2 = CRECDoc(
            title='foo',
            content='baz',
            date_issued=datetime(2016, 1, 1)
        )
        c.save(refresh=True)
        c2.save(refresh=True)
        start_date = datetime(2017, 1, 1)
        end_date = datetime(2017, 1, 30)
        query_args = {
            'start_date': start_date.strftime('%Y-%m-%d'),
            'end_date': end_date.strftime('%Y-%m-%d'),
            'content': 'bar',
            'title': 'foo',
        }
        response = self.client.get('/cwapi/search/', query_args)
        response_content = response.json()
        results = response_content['data']
        self.assertEquals(1, len(results))
    
    
    def test_pagination(self):
        start_date = datetime(2017, 1, 1)
        end_date = datetime(2017, 1, 30)
        query_args = {
            'start_date': start_date.strftime('%Y-%m-%d'),
            'end_date': end_date.strftime('%Y-%m-%d'),
            'content': 'foo',
        }
        response = self.client.get('/cwapi/search/', query_args)
        response_content = response.json()
        max_score1 = max([d['score'] for d in response_content['data']])
        self.assertIsNotNone(max_score1)
        self.assertEquals(10, len(response_content['data']))
        query_args['offset'] = 10
        response = self.client.get('/cwapi/search/', query_args)
        response_content = response.json()
        max_score2 = max([d['score'] for d in response_content['data']])
        self.assertIsNotNone(max_score2)
        self.assertTrue(max_score1 >= max_score2)
        self.assertEquals(10, len(response_content['data']))
Example #44
#!/usr/bin/env python

from datetime import date, timedelta
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Index, DocType, String

# elasticsearch
client = Elasticsearch(['192.168.33.108:9200','192.168.33.109:9200'])

# date
for x in range(3,5):
    ddate = date.today() - timedelta(x)
    ddate_str = ddate.strftime('%Y.%m.%d')
    idx = Index("logstash-%s" % ddate_str,using=client)
    idx.delete(ignore=404)
    print("%s is deleted" % ddate_str)
Example #45
class TestMixins(BaseTestCase):

    def setUp(self):
        super(TestMixins, self).setUp()
        self.doc_type = Token.get_es_doc_type()
        self.index = Index(self.doc_type._doc_type.index)
        self.index.doc_type(self.doc_type)
        self.index.create()
        self.refresh()

    def tearDown(self):
        super(TestMixins, self).tearDown()
        self.index.delete()

    def test_is_indexable(self):
        self.assertTrue(ESIndexableMixin().is_indexable())

    def test_is_index_update_needed(self):
        self.assertTrue(ESIndexableMixin().is_index_update_needed())

    def test_get_indexable_queryset(self):
        self.assertEqual(
            str(Token.get_indexable_queryset().query),
            str(Token.objects.all().query)
        )

    def test_get_es_doc(self):
        token = Token(name="token")
        self.assertIsNone(token.get_es_doc())
        token.save()
        self.assertIsNotNone(token.get_es_doc())

    def test_auto_doc_type_mapping(self):
        person = Person(first_name="Simion", last_name="Baws")
        person.save()
        doc_type = person.get_es_doc_mapping()
        self.assertEqual(doc_type.first_name, person.first_name)
        self.assertEqual(doc_type.last_name, person.last_name)
        self.assertEqual(
            doc_type.full_name,
            u"{0} {1}".format(person.first_name, person.last_name)
        )

    def test_es_index(self):
        # Asynchronous call.
        token = Token.objects.create(name='not_indexable')
        self.assertDocDoesntExist(token)
        token.es_index()
        self.assertDocExists(token)

        # Synchronous call.
        token = Token.objects.create(name='not_indexable')
        self.assertDocDoesntExist(token)
        token.es_index(async=False)
        self.assertDocExists(token)

        # Fail silently.
        settings.TRAMPOLINE['OPTIONS']['disabled'] = True
        token = Token.objects.create(name='raise_exception')
        settings.TRAMPOLINE['OPTIONS']['disabled'] = False
        token.es_index()
        self.assertDocDoesntExist(token)

        # Hard fail.
        settings.TRAMPOLINE['OPTIONS']['fail_silently'] = False
        with self.assertRaises(RuntimeError):
            token.es_index()
        settings.TRAMPOLINE['OPTIONS']['fail_silently'] = True

    def test_es_delete(self):
        # Asynchronous call.
        token = Token.objects.create(name='token')
        self.assertDocExists(token)
        token.es_delete()
        self.assertDocDoesntExist(Token, token.pk)

        # Synchronous call.
        token = Token.objects.create(name='token')
        self.assertDocExists(token)
        token.es_delete(async=False)
        self.assertDocDoesntExist(Token, token.pk)

        # Fail silently if document doesn't exist.
        token.es_delete()

        from trampoline import get_trampoline_config
        trampoline_config = get_trampoline_config()

        # Fake delete to raise exception.
        backup_delete = trampoline_config.connection.delete

        def delete_raise_exception(*args, **kwargs):
            raise RuntimeError
        trampoline_config.connection.delete = delete_raise_exception

        # Fail silently
        token.es_delete()

        # Hard fail.
        settings.TRAMPOLINE['OPTIONS']['fail_silently'] = False
        with self.assertRaises(RuntimeError):
            token.es_delete()
        settings.TRAMPOLINE['OPTIONS']['fail_silently'] = True

        trampoline_config.connection.delete = backup_delete

    def test_save(self):
        token = Token(name='token')

        settings.TRAMPOLINE['OPTIONS']['disabled'] = True
        token.save()
        settings.TRAMPOLINE['OPTIONS']['disabled'] = False
        self.assertDocDoesntExist(token)

        token.save()
        doc = token.get_es_doc()
        self.assertEqual(doc.name, 'token')
        self.assertEqual(doc._id, str(token.pk))

        # Update model and synchronise doc.
        token.name = 'kento'
        token.save()
        doc = token.get_es_doc()
        self.assertEqual(doc.name, 'kento')

        # Instance is not indexable.
        token = Token.objects.create(name='not_indexable')
        self.assertDocDoesntExist(token)

    def test_delete(self):
        token = Token.objects.create(name='token')
        token_id = token.pk
        self.assertDocExists(token)

        settings.TRAMPOLINE['OPTIONS']['disabled'] = True
        token.delete()
        settings.TRAMPOLINE['OPTIONS']['disabled'] = False
        self.assertDocExists(Token, token_id)

        token.save()
        token_id = token.pk
        token.delete()
        self.assertDocDoesntExist(Token, token_id)
Example #46
class ElasticSearchIndex:
    def __init__(self, name='qb', similarity='default', bm25_b=None, bm25_k1=None):
        self.name = name
        self.ix = Index(self.name)
        self.answer_doc = create_doctype(self.name, similarity)
        if bm25_b is None:
            bm25_b = .75
        if bm25_k1 is None:
            bm25_k1 = 1.2
        self.bm25_b = bm25_b
        self.bm25_k1 = bm25_k1

    def delete(self):
        try:
            self.ix.delete()
        except elasticsearch.exceptions.NotFoundError:
            log.info('Could not delete non-existent index.')

    def exists(self):
        return self.ix.exists()

    def init(self):
        self.ix.create()
        self.ix.close()
        self.ix.put_settings(body={'similarity': {
            'qb_bm25': {'type': 'BM25', 'b': self.bm25_b, 'k1': self.bm25_k1}}
        })
        self.ix.open()
        self.answer_doc.init(index=self.name)

    def build_large_docs(self, documents: Dict[str, str], use_wiki=True, use_qb=True, rebuild_index=False):
        if rebuild_index or bool(int(os.getenv('QB_REBUILD_INDEX', 0))):
            log.info(f'Deleting index: {self.name}')
            self.delete()

        if self.exists():
            log.info(f'Index {self.name} exists')
        else:
            log.info(f'Index {self.name} does not exist')
            self.init()
            wiki_lookup = Wikipedia()
            log.info('Indexing questions and corresponding wikipedia pages as large docs...')
            for page in tqdm.tqdm(documents):
                if use_wiki and page in wiki_lookup:
                    wiki_content = wiki_lookup[page].text
                else:
                    wiki_content = ''

                if use_qb:
                    qb_content = documents[page]
                else:
                    qb_content = ''

                answer = self.answer_doc(
                    page=page,
                    wiki_content=wiki_content, qb_content=qb_content
                )
                answer.save(index=self.name)

    def build_many_docs(self, pages, documents, use_wiki=True, use_qb=True, rebuild_index=False):
        if rebuild_index or bool(int(os.getenv('QB_REBUILD_INDEX', 0))):
            log.info(f'Deleting index: {self.name}')
            self.delete()

        if self.exists():
            log.info(f'Index {self.name} exists')
        else:
            log.info(f'Index {self.name} does not exist')
            self.init()
            log.info('Indexing questions and corresponding pages as many docs...')
            if use_qb:
                log.info('Indexing questions...')
                for page, doc in tqdm.tqdm(documents):
                    self.answer_doc(page=page, qb_content=doc).save()

            if use_wiki:
                log.info('Indexing wikipedia...')
                wiki_lookup = Wikipedia()
                for page in tqdm.tqdm(pages):
                    if page in wiki_lookup:
                        content = word_tokenize(wiki_lookup[page].text)
                        for i in range(0, len(content), 200):
                            chunked_content = content[i:i + 200]
                            if len(chunked_content) > 0:
                                self.answer_doc(page=page, wiki_content=' '.join(chunked_content)).save()

    def search(self, text: str, max_n_guesses: int,
               normalize_score_by_length=False,
               wiki_boost=1, qb_boost=1):
        if not self.exists():
            raise ValueError('The index does not exist, you must create it before searching')

        if wiki_boost != 1:
            wiki_field = 'wiki_content^{}'.format(wiki_boost)
        else:
            wiki_field = 'wiki_content'

        if qb_boost != 1:
            qb_field = 'qb_content^{}'.format(qb_boost)
        else:
            qb_field = 'qb_content'

        s = Search(index=self.name)[0:max_n_guesses].query(
            'multi_match', query=text, fields=[wiki_field, qb_field]
        )
        results = s.execute()
        guess_set = set()
        guesses = []
        if normalize_score_by_length:
            query_length = len(text.split())
        else:
            query_length = 1

        for r in results:
            if r.page in guess_set:
                continue
            else:
                guess_set.add(r.page)
                guesses.append((r.page, r.meta.score / query_length))
        return guesses
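
A hypothetical use of the class above, assuming a running Elasticsearch cluster with a registered default connection and an already populated index; the query is illustrative:

index = ElasticSearchIndex(name='qb', bm25_b=0.75, bm25_k1=1.2)
if not index.exists():
    index.init()  # create the index and apply the BM25 similarity settings
guesses = index.search('Who developed the theory of general relativity?', max_n_guesses=5)
for page, score in guesses:
    print(page, score)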
Example #47
 def handle(self, *args, **options):
     from searching.utils import autodiscover
     for _class in autodiscover():
         index = Index(_class.get_model_index().Meta.index)
         index.delete()
Example #48
class CountTermsTestCase(TestCase):

    def setUp(self):
        self.es_conn = connections.get_connection()
        self.test_crecs = []
        for i in range(20):
            self.test_crecs.append(
                CRECDoc(
                    title=str(i),
                    content='foo bar baz Foo',
                    date_issued=datetime(2017, 1, i % 5 + 1)
                )
            )
        self.index = Index(settings.ES_CW_INDEX)
        CRECDoc.init()
        for c in self.test_crecs:
            c.save(refresh=True)
        self.client = Client()

    def tearDown(self):
        self.index.delete()

    def test_num_docs_found(self):
        start_date = datetime(2017, 1, 1)
        end_date = datetime(2017, 1, 1)
        results = get_term_count_in_doc(
            self.es_conn, 'foo', start_date, end_date
        )
        buckets = get_term_count_agg(results)
        self.assertIsNotNone(buckets)
        self.assertEquals(len(buckets), 1)
        count = buckets[0].get('term_counts', {}).get('value')
        self.assertEquals(count, 8)

    def test_bucketing(self):
        start_date = datetime(2017, 1, 1)
        end_date = datetime(2017, 1, 30)
        results = get_term_count_in_doc(
            self.es_conn, 'foo', start_date, end_date
        )
        buckets = get_term_count_agg(results)
        self.assertIsNotNone(buckets)
        self.assertEquals(len(buckets), 5)
        for b in buckets:
            count = b.get('term_counts', {}).get('value')
            self.assertEquals(count, 8)

    def test_case_sensitivity(self):
        start_date = datetime(2017, 1, 1)
        end_date = datetime(2017, 1, 1)
        results = get_term_count_in_doc(
            self.es_conn, 'FOO', start_date, end_date
        )
        buckets = get_term_count_agg(results)
        self.assertIsNotNone(buckets)
        self.assertEquals(len(buckets), 1)
        count = buckets[0].get('term_counts', {}).get('value')
        self.assertEquals(count, 8)

    def test_api_start_end_specified(self):
        start_date = datetime(2017, 1, 1)
        end_date = datetime(2017, 1, 31)
        query_args = {
            'start_date': start_date.strftime('%Y-%m-%d'),
            'end_date': end_date.strftime('%Y-%m-%d'),
            'term': 'foo',
        }
        response = self.client.get('/cwapi/term_counts_by_day/', query_args)
        self.assertEquals(200, response.status_code)
        response_content = response.json()
        self.assertEqual('success', response_content['status'])
        self.assertEquals(31, len(response_content['data']['daily_counts']))
        total = 0
        for date_str, count in response_content['data']['daily_counts'].items():
            dt = datetime.strptime(date_str, '%Y-%m-%d')
            self.assertTrue(dt >= start_date and dt <= end_date)
            if dt.day > 5:
                self.assertEquals(0, count)
            else:
                self.assertEquals(8, count)
            total += count
        self.assertEquals(40, total)

    @freeze_time('2017-01-31')
    def test_api_days_ago(self):
        query_args = {
            'days_ago': 30,
            'term': 'foo',
        }
        response = self.client.get('/cwapi/term_counts_by_day/', query_args)
        self.assertEquals(200, response.status_code)
        response_content = response.json()
        self.assertEqual('success', response_content['status'])
        self.assertEquals(31, len(response_content['data']['daily_counts']))
        for date_str in response_content['data']['daily_counts'].keys():
            dt = datetime.strptime(date_str, '%Y-%m-%d')
            self.assertTrue(
                dt >= datetime(2017, 1, 1) and dt <= datetime(2017, 1, 31)
            )