def index_articles(self, titles=[], start=0, num=1000000):
    """Fetch articles through the proxy and post them, with their sources, to the docstore.

    Each selected title is loaded with ``Proxy().page(title)``.  A page is
    posted when it is published or when
    ``settings.MEDIAWIKI_SHOW_UNPUBLISHED`` is truthy; otherwise it is
    collected in the "could not post" list.  For a posted page every entry
    of ``page.sources`` is first posted as its own 'sources' document,
    then ``page.sources`` is replaced by the list of source
    ``encyclopedia_id`` values before the page itself is posted as an
    'articles' document.

    @param titles: list of url_titles to retrieve.  Only read, never modified.
    @param start: int Index of list at which to start (inclusive).
    @param num: int Maximum number of articles to index, beginning at start.
    @returns: (int num posted, list Articles that could not be posted)
    """
    posted = 0
    could_not_post = []
    for n, title in enumerate(titles):
        if posted >= num:
            # Quota reached: no later title would be indexed, so stop early.
            break
        if n < start:
            # Skip everything before the start index; the item AT start
            # is included (the previous "n > start" test dropped it).
            continue
        logging.debug('%s/%s %s', n, len(titles), title)
        page = Proxy().page(title)
        if not (page.published or settings.MEDIAWIKI_SHOW_UNPUBLISHED):
            could_not_post.append(page)
            continue
        # Collect the IDs up front so a malformed source fails before
        # anything has been posted for this page.
        page_sources = [source['encyclopedia_id'] for source in page.sources]
        for source in page.sources:
            logging.debug(' %s', source['encyclopedia_id'])
            docstore.post(
                settings.DOCSTORE_HOSTS,
                settings.DOCSTORE_INDEX,
                'sources',
                source['encyclopedia_id'],
                source,
            )
        # The article document carries only the source IDs, not the
        # full source records that were just posted separately.
        page.sources = page_sources
        docstore.post(
            settings.DOCSTORE_HOSTS,
            settings.DOCSTORE_INDEX,
            'articles',
            title,
            page.__dict__,
        )
        posted += 1
        logging.debug('posted %s', posted)
    if could_not_post:
        logging.debug('Could not post these: %s', could_not_post)
    return posted, could_not_post
def index_authors(self, titles=[]):
    """Post each listed author page to the docstore as an 'authors' document.

    Every title is loaded with ``Proxy().page(title)`` and the page's
    ``__dict__`` is posted under the title as document ID.

    @param titles: list of url_titles to retrieve.
    """
    for position, url_title in enumerate(titles):
        progress = '%s/%s %s' % (position, len(titles), url_title)
        logging.debug(progress)
        author_page = Proxy().page(url_title)
        hosts = settings.DOCSTORE_HOSTS
        index = settings.DOCSTORE_INDEX
        docstore.post(hosts, index, 'authors', url_title, author_page.__dict__)
def index_topics(self, json_text=None, url=settings.DDR_TOPICS_SRC_URL):
    """Upload topics.json; used for Encyc->DDR links on article pages.

    Example (pass the URL by keyword; the first positional argument is
    json_text):

        url = 'http://partner.densho.org/vocab/api/0.2/topics.json'
        models.Elasticsearch().index_topics(url=url)

    The parsed JSON is posted to the docstore as document 'topics' of
    type 'vocab'.

    @param json_text: unicode Raw topics.json file text.  When non-empty,
        it is used as-is and nothing is downloaded.
    @param url: URL of topics.json; fetched only when json_text is empty.
    @raises ValueError: if the download does not return HTTP 200, if there
        is no topics text to index at all, or if the text is not valid JSON.
    """
    if url and not json_text:
        r = requests.get(url)
        if r.status_code != 200:
            # Previously this fell through with json_text=None and died
            # inside json.loads with an uninformative TypeError.
            raise ValueError(
                'Could not download topics from %s (HTTP %s)' % (url, r.status_code)
            )
        json_text = r.text
    if not json_text:
        raise ValueError('No topics JSON to index: json_text is empty and no url given')
    topics = json.loads(json_text)
    docstore.post(
        settings.DOCSTORE_HOSTS,
        settings.DOCSTORE_INDEX,
        'vocab',
        'topics',
        topics,
    )