def reload_period_from_analytics(cls, period, verbose=False):
    """Replace the stats for the given period from Google Analytics."""
    counts = googleanalytics.pageviews_by_document(
        *period_dates(period), verbose=verbose)
    if counts:
        # Close any existing connections because our load balancer times
        # them out at 5 minutes and the GA calls take forever.
        close_old_connections()

        # Delete and remake the rows:
        # Horribly inefficient until
        # http://code.djangoproject.com/ticket/9519 is fixed.
        # cls.objects.filter(period=period).delete()
        # Instead, we use raw SQL!
        cursor = connection.cursor()
        cursor.execute(
            "DELETE FROM `dashboards_wikidocumentvisits`"
            " WHERE `period` = %s",
            [period])

        # Now we create them again with fresh data.
        for doc_id, visits in counts.items():
            cls.objects.create(
                document=Document(pk=doc_id), visits=visits, period=period)
    else:
        # Don't erase interesting data if there's nothing to replace it:
        log.warning("Google Analytics returned no interesting data,"
                    " so I kept what I had.")
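# Usage sketch (hypothetical names): this classmethod reads like it belongs
# on a WikiDocumentVisits model and gets called from a scheduled job, once
# per period constant. PERIODS and reload_wiki_traffic_stats are
# illustrative assumptions, not taken from the code above.
def reload_wiki_traffic_stats(verbose=False):
    for period in PERIODS:
        WikiDocumentVisits.reload_period_from_analytics(period,
                                                        verbose=verbose)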
def test_document_translate_fallback(self):
    d_en = DocumentFactory(locale='en-US',
                           title=u'How to delete Google Chrome?')
    invalid_translate = reverse('wiki.document', locale='tr',
                                args=[d_en.slug])
    self.assertEqual(d_en, Document.from_url(invalid_translate))
def test_document_translate_fallback(self):
    d_en = document(locale='en-US', title=u'How to delete Google Chrome?',
                    save=True)
    invalid_translate = reverse('wiki.document', locale='tr',
                                args=[d_en.slug])
    self.assertEqual(d_en, Document.from_url(invalid_translate))
def test_document_translate_fallback(self): d_en = DocumentFactory(locale="en-US", title="How to delete Google Chrome?") invalid_translate = reverse("wiki.document", locale="tr", args=[d_en.slug]) self.assertEqual(d_en, Document.from_url(invalid_translate))
def document(**kwargs):
    """Return an empty document with enough stuff filled out that it can be
    saved."""
    defaults = {'category': CATEGORIES[0][0],
                'title': u'đ' + str(datetime.now())}
    defaults.update(kwargs)
    if 'slug' not in kwargs:
        defaults['slug'] = slugify(defaults['title'])
    return Document(**defaults)
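# Usage sketch: the factory returns an unsaved Document with unique defaults,
# so callers override only the fields they assert on. Note the tests above
# pass save=True, which this version does not accept; presumably a wrapper
# (e.g. a with_save-style decorator, assumed here) pops that kwarg and calls
# save() on the result.
d = document(locale='en-US')
d.save()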
def pageviews_by_document(start_date, end_date):
    """Return the number of pageviews by document in a given date range.

    * Only returns en-US documents for now since that's what we did with
      webtrends.

    Returns a dict with pageviews for each document:
        {<document_id>: <pageviews>,
         1: 42,
         7: 1337,...}
    """
    counts = {}
    request = _build_request()
    start_index = 1
    max_results = 10000

    while True:  # To deal with pagination

        @retry_503
        def _make_request():
            return request.get(
                ids='ga:' + profile_id,
                start_date=str(start_date),
                end_date=str(end_date),
                metrics='ga:pageviews',
                dimensions='ga:pagePath',
                filters='ga:pagePathLevel2==/kb/;ga:pagePathLevel1==/en-US/',
                max_results=max_results,
                start_index=start_index).execute()

        results = _make_request()

        for result in results['rows']:
            path = result[0]
            pageviews = int(result[1])
            doc = Document.from_url(path, id_only=True, check_host=False)
            if not doc:
                continue
            # The same document can appear multiple times due to url params.
            counts[doc.pk] = counts.get(doc.pk, 0) + pageviews

        # Move to next page of results.
        start_index += max_results
        if start_index > results['totalResults']:
            break

    return counts
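# Usage sketch (assumed call site): pull a month of per-document pageviews.
# profile_id and _build_request() are module-level details not shown here.
from datetime import date

counts = pageviews_by_document(date(2013, 1, 1), date(2013, 1, 31))
for doc_id, views in sorted(counts.items()):
    print doc_id, views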
def reload_period_from_analytics(cls, period):
    """Replace the stats for the given period from Google Analytics."""
    counts = googleanalytics.pageviews_by_document(*period_dates(period))
    if counts:
        # Delete and remake the rows:
        # Horribly inefficient until
        # http://code.djangoproject.com/ticket/9519 is fixed.
        cls.objects.filter(period=period).delete()
        for doc_id, visits in counts.iteritems():
            cls.objects.create(document=Document(pk=doc_id), visits=visits,
                               period=period)
    else:
        # Don't erase interesting data if there's nothing to replace it:
        log.warning('Google Analytics returned no interesting data,'
                    ' so I kept what I had.')
def related_documents(self):
    """Return documents that are 'morelikethis' one"""
    if not self.product:
        return []

    # First try to get the results from the cache
    key = 'questions_question:related_docs:%s' % self.id
    documents = cache.get(key)
    if documents is not None:
        statsd.incr('questions.related_documents.cache.hit')
        log.debug(
            'Getting MLT documents for {question} from cache.'.format(
                question=repr(self)))
        return documents

    try:
        statsd.incr('questions.related_documents.cache.miss')
        s = Document.get_mapping_type().search()
        documents = (s.values_dict('id', 'document_title', 'url').filter(
            document_locale=self.locale,
            document_is_archived=False,
            document_category__in=settings.IA_DEFAULT_CATEGORIES,
            product__in=[self.product.slug]
        ).query(
            __mlt={
                'fields': ['document_title', 'document_summary',
                           'document_content'],
                'like_text': self.title,
                'min_term_freq': 1,
                'min_doc_freq': 1
            })[:3])
        documents = list(documents)
        cache.add(key, documents)
    except ES_EXCEPTIONS:
        statsd.incr('questions.related_documents.esexception')
        log.exception('ES MLT related_documents')
        documents = []

    return documents
def related_documents(self):
    """Return documents that are 'morelikethis' one"""
    if not self.product:
        return []

    # First try to get the results from the cache
    key = 'questions_question:related_docs:%s' % self.id
    documents = cache.get(key)
    if documents is not None:
        statsd.incr('questions.related_documents.cache.hit')
        log.debug('Getting MLT documents for {question} from cache.'
                  .format(question=repr(self)))
        return documents

    try:
        statsd.incr('questions.related_documents.cache.miss')
        s = Document.get_mapping_type().search()
        documents = (
            s.values_dict('id', 'document_title', 'url')
            .filter(document_locale=self.locale,
                    document_is_archived=False,
                    document_category__in=settings.IA_DEFAULT_CATEGORIES,
                    product__in=[self.product.slug])
            .query(__mlt={
                'fields': ['document_title', 'document_summary',
                           'document_content'],
                'like_text': self.title,
                'min_term_freq': 1,
                'min_doc_freq': 1})
            [:3])
        documents = list(documents)
        cache.add(key, documents)
    except ES_EXCEPTIONS:
        statsd.incr('questions.related_documents.esexception')
        log.exception('ES MLT related_documents')
        documents = []

    return documents
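# Design note on the caching in both variants above: Django's cache.add()
# stores the value only if the key is absent (unlike cache.set()), so
# whichever request finishes computing the MLT results first wins, and later
# requests keep reusing the cached copy until it expires.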
class UntranslatedReadout(Readout):
    title = _lazy(u'Untranslated')
    description = _lazy(
        u'This indicates there are no approved translations of these articles. '
        'Some of the articles may have proposed translations waiting to be '
        'reviewed and will appear in the Unreviewed Changes section as well.')
    short_title = _lazy(u'Untranslated')
    details_link_text = _lazy(u'All untranslated articles...')
    slug = 'untranslated'
    column4_label = _lazy(u'Updated')

    def _query_and_params(self, max):
        # Filter by product if specified.
        if self.product:
            extra_joins = PRODUCT_FILTER
            params = (self.locale, LAST_30_DAYS, self.product.id,
                      settings.WIKI_DEFAULT_LANGUAGE)
        else:
            extra_joins = ''
            params = (self.locale, LAST_30_DAYS,
                      settings.WIKI_DEFAULT_LANGUAGE)

        # Incidentally, we tried this both as a left join and as a search
        # against an inner query returning translated docs, and the left join
        # yielded a faster-looking plan (on a production corpus).
        #
        # Find non-archived, localizable documents in categories 10, 20 and
        # 60 having at least one ready-for-localization revision. Of those,
        # show the ones that have no translation.
        query = (
            'SELECT engdoc.slug, engdoc.title, '
            'wiki_revision.reviewed, dashboards_wikidocumentvisits.visits '
            'FROM wiki_document engdoc '
            'INNER JOIN wiki_revision ON '
            'engdoc.latest_localizable_revision_id=wiki_revision.id '
            'LEFT JOIN wiki_document translated ON '
            'engdoc.id=translated.parent_id AND translated.locale=%s '
            'LEFT JOIN dashboards_wikidocumentvisits ON '
            'engdoc.id=dashboards_wikidocumentvisits.document_id AND '
            'dashboards_wikidocumentvisits.period=%s '
            + extra_joins +
            'WHERE '
            '(translated.id IS NULL OR translated.current_revision_id IS NULL) '
            'AND engdoc.is_localizable AND '
            'engdoc.category in (10, 20, 60) AND '
            'engdoc.locale=%s AND NOT engdoc.is_archived '
            'AND wiki_revision.content NOT LIKE "REDIRECT%%" '
            + self._order_clause() + self._limit_clause(max))
        return query, params

    def _order_clause(self):
        return ('ORDER BY wiki_revision.reviewed DESC, engdoc.title ASC'
                if self.mode == MOST_RECENT
                else 'ORDER BY dashboards_wikidocumentvisits.visits DESC, '
                     'engdoc.title ASC')

    def _format_row(self, (slug, title, reviewed, visits)):
        # Run the data through the model to (potentially) format it and
        # take advantage of SPOTs (like for get_absolute_url()):
        d = Document(slug=slug, title=title,
                     locale=settings.WIKI_DEFAULT_LANGUAGE)
        return dict(title=d.title,
                    url=d.get_absolute_url(),
                    visits=visits,
                    updated=reviewed)
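# Sketch (an assumption, not from the code above): the Readout base class
# implied here presumably runs the SQL from _query_and_params() and feeds
# each result tuple through _format_row(). A minimal version of that
# contract might look like:
def rows(self, max=None):
    query, params = self._query_and_params(max)
    cursor = connection.cursor()
    cursor.execute(query, params)
    return [self._format_row(row) for row in cursor.fetchall()]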
def test_document_translate_fallback(self): d_en = document(locale="en-US", title=u"How to delete Google Chrome?", save=True) invalid_translate = reverse("wiki.document", locale="tr", args=[d_en.slug]) self.assertEqual(d_en, Document.from_url(invalid_translate))
def pageviews_by_document(start_date, end_date, verbose=False):
    """Return the number of pageviews by document in a given date range.

    * Only returns en-US documents for now since that's what we did with
      webtrends.

    Returns a dict with pageviews for each document:
        {<document_id>: <pageviews>,
         1: 42,
         7: 1337,...}
    """
    counts = {}
    request = _build_request()
    max_results = 10000
    end_date_step = end_date

    while True:
        # To reduce the size of result set request 3 months at a time
        start_date_step = end_date_step - timedelta(90)
        if start_date_step < start_date:
            start_date_step = start_date

        if verbose:
            print 'Fetching data for {0!s} to {1!s}:'.format(start_date_step,
                                                             end_date_step)
        start_index = 1

        while True:  # To deal with pagination

            @retry_503
            def _make_request():
                return request.get(
                    ids='ga:' + profile_id,
                    start_date=str(start_date_step),
                    end_date=str(end_date_step),
                    metrics='ga:pageviews',
                    dimensions='ga:pagePath',
                    filters=('ga:pagePathLevel2==/kb/;'
                             'ga:pagePathLevel1==/en-US/'),
                    max_results=max_results,
                    start_index=start_index).execute()

            results = _make_request()

            if verbose:
                d = (max_results - 1
                     if start_index + max_results - 1 < results['totalResults']
                     else results['totalResults'] - start_index)
                print '- Got {0!s} of {1!s} results.'.format(
                    start_index + d, results['totalResults'])

            for result in results.get('rows', []):
                path = result[0]
                pageviews = int(result[1])
                doc = Document.from_url(path, id_only=True, check_host=False)
                if not doc:
                    continue
                # The same document can appear multiple times due to url params
                counts[doc.pk] = counts.get(doc.pk, 0) + pageviews

            # Move to next page of results.
            start_index += max_results
            if start_index > results.get('totalResults', 0):
                break

        end_date_step = start_date_step - timedelta(1)

        if start_date_step == start_date or end_date_step < start_date:
            break

    return counts
def pageviews_by_document(start_date, end_date, verbose=False):
    """Return the number of pageviews by document in a given date range.

    * Only returns en-US documents for now since that's what we did with
      webtrends.

    Returns a dict with pageviews for each document:
        {<document_id>: <pageviews>,
         1: 42,
         7: 1337,...}
    """
    counts = {}
    request = _build_request()
    max_results = 10000
    end_date_step = end_date

    while True:
        # To reduce the size of result set request 3 months at a time
        start_date_step = end_date_step - timedelta(90)
        if start_date_step < start_date:
            start_date_step = start_date

        if verbose:
            print 'Fetching data for %s to %s:' % (start_date_step,
                                                   end_date_step)
        start_index = 1

        while True:  # To deal with pagination

            @retry_503
            def _make_request():
                return request.get(
                    ids='ga:' + profile_id,
                    start_date=str(start_date_step),
                    end_date=str(end_date_step),
                    metrics='ga:pageviews',
                    dimensions='ga:pagePath',
                    filters=('ga:pagePathLevel2==/kb/;'
                             'ga:pagePathLevel1==/en-US/'),
                    max_results=max_results,
                    start_index=start_index).execute()

            results = _make_request()

            if verbose:
                d = (max_results - 1
                     if start_index + max_results - 1 < results['totalResults']
                     else results['totalResults'] - start_index)
                print '- Got %s of %s results.' % (start_index + d,
                                                   results['totalResults'])

            for result in results['rows']:
                path = result[0]
                pageviews = int(result[1])
                doc = Document.from_url(path, id_only=True, check_host=False)
                if not doc:
                    continue
                # The same document can appear multiple times due to url params
                counts[doc.pk] = counts.get(doc.pk, 0) + pageviews

            # Move to next page of results.
            start_index += max_results
            if start_index > results['totalResults']:
                break

        end_date_step = start_date_step - timedelta(1)

        if start_date_step == start_date or end_date_step < start_date:
            break

    return counts
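# The two variants above share a date-chunking pattern worth isolating: walk
# backwards from end_date in roughly 90-day windows until start_date is
# covered. A minimal sketch of just that loop (date_chunks is an
# illustrative name, not from the code above):
from datetime import timedelta

def date_chunks(start_date, end_date, days=90):
    end_step = end_date
    while True:
        start_step = max(end_step - timedelta(days), start_date)
        yield start_step, end_step
        if start_step == start_date:
            break
        end_step = start_step - timedelta(1)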