Example #1
def test_daterange2dates():
    assert_equals(utils.daterange2dates(settings.TEXCAVATOR_DATE_RANGE),
                  [{'lower': '1850-01-01', 'upper': '1990-12-31'}])

    # Input single date
    assert_equals(utils.daterange2dates("19901231"),
                  [{'lower': '1850-01-01', 'upper': '1990-12-31'}])

    # Empty input
    assert_equals(utils.daterange2dates(""),
                  [{'lower': '1850-01-01', 'upper': '1990-12-31'}])
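
The assertions above pin down the contract: a full "YYYYMMDD,YYYYMMDD" range is converted to a list with one dict of ISO dates, while a single date or empty input falls back to the configured default range. A minimal sketch with that behaviour (hypothetical; the real utils.daterange2dates may differ in details):

    from django.conf import settings

    def daterange2dates(date_range_str):
        """Convert 'YYYYMMDD,YYYYMMDD' to [{'lower': ..., 'upper': ...}] (sketch)."""
        parts = date_range_str.split(',') if date_range_str else []
        if len(parts) != 2:
            # single dates and empty input fall back to the default range
            parts = settings.TEXCAVATOR_DATE_RANGE.split(',')
        iso = ['{}-{}-{}'.format(p[:4], p[4:6], p[6:8]) for p in parts]
        return [{'lower': iso[0], 'upper': iso[1]}]
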
Example #2
    def handle(self, *args, **options):
        query_size = 100000
        n_repetitions = 10

        if len(args) > 0:
            query_size = int(args[0])
        if len(args) > 1:
            n_repetitions = int(args[1])

        response_times = []

        for repetition in range(n_repetitions):
            # select random documents
            document_set = DocID.objects.order_by('?')[0:query_size]
            doc_ids = [doc.doc_id for doc in document_set]

            dates = daterange2dates(settings.TEXCAVATOR_DATE_RANGE)
            aggr_resp = multiple_document_word_cloud(settings.ES_INDEX,
                                                     settings.ES_DOCTYPE, None,
                                                     dates[0], [], [], doc_ids)
            response_times.append(int(aggr_resp.get('took')))
            self.stdout.write(str(aggr_resp.get('took')))
            self.stdout.flush()

        avg = sum(response_times) / float(len(response_times))
        print 'Average response time for aggregating over {num} documents: ' \
              '{avg} milliseconds'.format(num=query_size, avg=avg)
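
The command reads the query size and the number of repetitions from optional positional arguments. A hypothetical invocation (the command name is not given in this listing):

    python manage.py <command> 50000 5    # 50000 random documents, 5 repetitions
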
Example #4
def heatmap(request, query_id, year):
    """
    Retrieves heatmap data for the given Query and year.
    """
    query = get_object_or_404(Query, pk=query_id)
    params = query.get_query_dict()

    year = int(year)
    date_range = daterange2dates(str(year - 5) + '0101,' + str(year + 5) + '1231')

    result = metadata_aggregation(settings.ES_INDEX,
                                  settings.ES_DOCTYPE,
                                  params['query'],
                                  date_range,
                                  params['exclude_distributions'],
                                  params['exclude_article_types'],
                                  params['selected_pillars'],
                                  articles_over_time())

    articles_per_day = {}
    for bucket in result['aggregations']['articles_over_time']['buckets']:
        date = bucket['key'] / 1000  # Cal-HeatMap requires the date in seconds instead of milliseconds
        articles_per_day[date] = bucket['doc_count']

    return JsonResponse(articles_per_day)
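
Cal-HeatMap consumes a flat JSON object keyed by epoch seconds, which is why each bucket key (epoch milliseconds) is divided by 1000; for instance, 631152000000 ms becomes 631152000 s (1990-01-01T00:00:00Z). A hypothetical response body (counts are illustrative):

    {"631152000": 17, "631238400": 23}
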
Example #6
    def handle(self, *args, **options):
        print 'Emptying table...'
        Term.objects.all().delete()

        for timeframe, dates in TIMEFRAMES.items():
            print 'Retrieving documents for timeframe {}...'.format(timeframe)
            exclude_dist = Distribution.objects.exclude(
                name='Landelijk').values_list('id', flat=True)
            date_range = daterange2dates(dates)

            total_documents = count_search_results(settings.ES_INDEX,
                                                   settings.ES_DOCTYPE, None,
                                                   date_range, exclude_dist,
                                                   [], []).get('count')
            print 'Total documents: {}'.format(total_documents)

            sets = document_id_chunks(10000,
                                      settings.ES_INDEX,
                                      settings.ES_DOCTYPE,
                                      None,
                                      date_range,
                                      dist=exclude_dist)

            print 'Counting terms...'
            counter = Counter()
            for n, s in enumerate(sets):
                start_time = time.time()
                counter += termvector_wordcloud(settings.ES_INDEX,
                                                settings.ES_DOCTYPE,
                                                s,
                                                min_length=2,
                                                add_freqs=False)
                print 'Completed set {} in {} seconds...'.format(
                    n + 1,
                    time.time() - start_time)

            print 'Calculating IDFs...'
            terms = []
            for term, count in counter.items():
                if count > 1:  # don't add single occurrences
                    idf = math.log10(total_documents / float(count))
                    terms.append(
                        Term(timeframe=timeframe,
                             word=term,
                             count=count,
                             idf=idf))

            print 'Transferring to database...'
            Term.objects.bulk_create(terms, batch_size=10000)

            print 'Creating RecordDAWG'
            d = dawg.RecordDAWG(
                '<d', zip([t.word for t in terms], [(t.idf, ) for t in terms]))
            d.save(os.path.join(settings.PROJECT_PARENT, timeframe + '.dawg'))
        """ Test code below.
Example #8
def get_search_parameters(req_dict):
    """Return a tuple of search parameters extracted from a dictionary

    Parameters:
        req_dict : dict
            A Django request dictionary

    Returns:
        dict : dict
            A dictionary that contains query metadata
    """
    query_str = req_dict.get('query', None)

    start = int(req_dict.get('startRecord', 1))

    result_size = int(req_dict.get('maximumRecords', 20))

    date_range_str = req_dict.get('dateRange', settings.TEXCAVATOR_DATE_RANGE)
    dates = daterange2dates(date_range_str)

    distributions = []
    for ds in _KB_DISTRIBUTION_VALUES.keys():
        use_ds = json.loads(req_dict.get(ds, "true"))
        if not use_ds:
            distributions.append(ds)

    article_types = []
    for typ in _KB_ARTICLE_TYPE_VALUES:
        use_type = json.loads(req_dict.get(typ, "true"))
        if not use_type:
            article_types.append(typ)

    pillars = [int(x) for x in req_dict.getlist('pillars')]
    collection = req_dict.get('collection', settings.ES_INDEX)
    sort_order = req_dict.get('sort_order', '_score')

    return {
        'query': query_str,
        'start': start,
        'result_size': result_size,
        'dates': dates,
        'distributions': distributions,
        'article_types': article_types,
        'pillars': pillars,
        'collection': collection,
        'sort_order': sort_order
    }
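
A hypothetical call, illustrating that req_dict needs QueryDict semantics (getlist) rather than a plain dict:

    from django.http import QueryDict

    params = get_search_parameters(QueryDict('query=strike&maximumRecords=10&pillars=1&pillars=3'))
    # params['result_size'] == 10, params['pillars'] == [1, 3], and
    # params['dates'] falls back to settings.TEXCAVATOR_DATE_RANGE
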
Example #10
    def handle(self, *args, **options):
        if QueryTerm.objects.all().count() == 0:
            print 'No query terms stored in the database. Please run ' \
                  '\'python manage.py gatherqueryterms\' first.'
            sys.exit(1)

        query_size = 10
        n_repetitions = 10

        if len(args) > 0:
            query_size = int(args[0])
        if len(args) > 1:
            n_repetitions = int(args[1])

        response_times = []
        es_wall_clock = []

        for repetition in range(n_repetitions):
            # generate random weighted query
            query_terms = QueryTerm.objects.order_by('?')[0:query_size]

            query_list = [
                '{}^{}'.format(t.term, randint(1, 40)) for t in query_terms
            ]
            q = ' OR '.join(query_list)

            t1 = time.time()
            dates = daterange2dates(settings.TEXCAVATOR_DATE_RANGE)
            valid_q, result = do_search(settings.ES_INDEX, settings.ES_DOCTYPE,
                                        q, 0, 20, dates[0], [], [])
            t2 = time.time()

            if not valid_q:
                print 'Invalid query: {}'.format(q)
            else:
                es_wall_clock.append((t2 - t1) * 1000)
                response_times.append(int(result.get('took')))
                self.stdout.write(str(result.get('took')))
                self.stdout.flush()

        avg = sum(response_times) / float(len(response_times))
        avg_wall_clock = sum(es_wall_clock) / float(len(es_wall_clock))
        print 'Average response time for queries of size {}: {} milliseconds'. \
              format(query_size, avg)
        print 'Average wall clock time for queries of size {}: {} ' \
              'milliseconds'.format(query_size, avg_wall_clock)
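
The generated query uses Lucene boost syntax (term^weight) joined with OR. With query_size = 3, q might look like the following (terms and weights are random, so this is purely illustrative):

    staking^12 OR mijnwerkers^3 OR loonsverhoging^27
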
Example #12
def index(request):
    """Render main page."""

    from services.es import _KB_DISTRIBUTION_VALUES, _KB_ARTICLE_TYPE_VALUES

    config_reverse_mapping = {"sd": flip_dict(_KB_DISTRIBUTION_VALUES),
                              "st": flip_dict(_KB_ARTICLE_TYPE_VALUES)}

    date_limits = daterange2dates(settings.TEXCAVATOR_DATE_RANGE)

    data = {
        "PROJECT_NAME": settings.PROJECT_NAME,
        "PROJECT_MIN_DATE": date_limits[0]["lower"],
        "PROJECT_MAX_DATE": date_limits[0]["upper"],
        "QUERY_DATA_DOWNLOAD_ALLOW": settings.QUERY_DATA_DOWNLOAD_ALLOW,
        "ES_INDEX": settings.ES_INDEX,
        "ES_REVERSE_MAPPING": json.dumps(config_reverse_mapping),
        "ILPS_LOGGING": settings.ILPS_LOGGING,
    }

    return render_to_response("index.html", data, RequestContext(request))
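
flip_dict is not shown in this listing; from its use here it presumably inverts a mapping so the front end can translate configured values back to their keys. A minimal sketch under that assumption:

    def flip_dict(d):
        """Swap keys and values (sketch; assumes values are unique and hashable)."""
        return dict((value, key) for key, value in d.items())
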
Example #13
    def handle(self, *args, **options):
        print "Emptying table..."
        DayStatistic.objects.all().delete()

        dates = daterange2dates(settings.TEXCAVATOR_DATE_RANGE)

        year_lower = datetime.strptime(dates[0]["lower"], "%Y-%m-%d").date().year
        year_upper = datetime.strptime(dates[0]["upper"], "%Y-%m-%d").date().year

        if len(args) > 0:
            year_lower = int(args[0])
        if len(args) > 1:
            year_upper = int(args[1])

        print "Gathering statistics from %s until %s." % (year_lower, year_upper)

        agg_name = "daystatistic"

        for year in range(year_lower, year_upper + 1):
            date_range = {"lower": "{y}-01-01".format(y=year), "upper": "{y}-12-31".format(y=year)}

            print year

            results = day_statistics(settings.ES_INDEX, settings.ES_DOCTYPE, date_range, agg_name)

            if results:
                # save results to database
                agg_data = results["aggregations"][agg_name]["buckets"]

                for date in agg_data:
                    try:
                        d = datetime.strptime(date["key_as_string"], "%Y-%m-%dT00:00:00.000Z").date()
                        DayStatistic.objects.create(date=str(d), count=date["doc_count"])
                    except DatabaseError as exc:
                        msg = "Database Error: %s" % exc
                        if settings.DEBUG:
                            print msg
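
Elasticsearch returns each bucket's key_as_string in its full date format; the literal tail of the strptime pattern ('T00:00:00.000Z') strips it off:

    >>> from datetime import datetime
    >>> datetime.strptime('1931-05-04T00:00:00.000Z', '%Y-%m-%dT00:00:00.000Z').date()
    datetime.date(1931, 5, 4)
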
Example #14
    def handle(self, *args, **options):
        print 'Emptying table...'
        DayStatistic.objects.all().delete()

        dates = daterange2dates(settings.TEXCAVATOR_DATE_RANGE)

        year_lower = datetime.strptime(dates[0]['lower'],
                                       '%Y-%m-%d').date().year
        year_upper = datetime.strptime(dates[0]['upper'],
                                       '%Y-%m-%d').date().year

        if len(args) > 0:
            year_lower = int(args[0])
        if len(args) > 1:
            year_upper = int(args[1])

        print 'Gathering statistics from %s through %s.' \
            % (year_lower, year_upper)

        agg_name = 'daystatistic'

        for year in range(year_lower, year_upper + 1):
            date_range = {
                'lower': '{y}-01-01'.format(y=year),
                'upper': '{y}-12-31'.format(y=year)
            }

            print year

            for article_type in ArticleType.objects.all():
                for distribution in Distribution.objects.all():
                    results = day_statistics(settings.ES_INDEX,
                                             settings.ES_DOCTYPE, date_range,
                                             agg_name, distribution.id,
                                             article_type.id)
                    self.save_to_database(agg_name, results, distribution,
                                          article_type)
Example #17
def tv_cloud(request):
    """Generate word cloud data using the termvector approach.

    Returns word cloud data for a single-document word cloud (based on a
    single document id) or for multiple-document word clouds (based either
    on a list of document ids, i.e. a timeline burst cloud, or on a query
    with metadata).

    For multiple-document word clouds, a celery task generates the cloud data.
    """
    if settings.DEBUG:
        print >> stderr, "termvector cloud()"
    logger.info('services/cloud/ - termvector word cloud')
    logger.info('services/cloud/ - user: {}'.format(request.user.username))

    # Retrieve the cloud settings
    query_id = request.GET.get('queryID')
    min_length = int(request.GET.get('min_length', 2))
    use_stopwords = request.GET.get('stopwords') == "1"
    use_default_stopwords = request.GET.get('stopwords_default') == "1"
    stems = request.GET.get('stems') == "1"

    # Retrieve the stopwords
    stopwords = []
    if use_stopwords:
        stopwords_user = list(
            StopWord.objects.filter(user=request.user).filter(
                query=None).values_list('word', flat=True))

        stopwords_query = []
        if query_id:
            stopwords_query = list(
                StopWord.objects.filter(user=request.user).filter(
                    query__id=query_id).values_list('word', flat=True))

        stopwords_default = []
        if use_default_stopwords:
            stopwords_default = list(
                StopWord.objects.filter(user=None).filter(
                    query=None).values_list('word', flat=True))

        stopwords = stopwords_user + stopwords_query + stopwords_default

    record_id = request.GET.get('record_id')
    logger.info('services/cloud/ - record_id: {}'.format(record_id))

    idf_timeframe = request.GET.get('idf_timeframe')

    if record_id:
        # Cloud for a single document
        t_vector = single_document_word_cloud(settings.ES_INDEX,
                                              settings.ES_DOCTYPE, record_id,
                                              min_length, stopwords, stems)
        normalized = normalize_cloud(t_vector['result'], idf_timeframe)
        return json_response_message('ok', 'Word cloud generated',
                                     {'result': normalized})
    else:
        # Cloud for a query
        logger.info('services/cloud/ - multiple document word cloud')

        query = get_object_or_404(Query, pk=query_id)
        params = query.get_query_dict()

        # If we're creating a timeline cloud, set the min/max dates
        date_range = None
        if request.GET.get('is_timeline'):
            date_range = daterange2dates(request.GET.get('date_range'))

        task = generate_tv_cloud.delay(params, min_length, stopwords,
                                       date_range, stems, idf_timeframe)
        logger.info('services/cloud/ - Celery task id: {}'.format(task.id))

        return json_response_message('ok', '', {'task': task.id})
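
For the query branch, the response only carries a celery task id; the cloud data must be collected once the task finishes. The result endpoint is not shown in this listing, but a minimal sketch using celery's own API would be:

    from celery.result import AsyncResult

    res = AsyncResult(task_id)  # task_id taken from the 'task' field of the JSON response
    if res.ready():
        cloud_data = res.get()  # the generated word cloud data
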