def load_essay(essay_url, index=True, session=None):
    """
    Load an essay from an RDFa HTML document.

    Fetches essay_url, scrapes the Dublin Core RDFa properties out of the
    HTML, saves an Essay record, attaches the Title records the essay is
    about (loading them remotely when absent), and returns the saved Essay.
    """
    if session is None:
        session = requests.Session()

    LOGGER.info("loading essay %s", essay_url)

    # the essay id is the third path segment of the URL
    essay_id = urlparse.urlparse(essay_url)[2].split("/")[2]

    response = session.get(essay_url)
    response.raise_for_status()
    doc = BeautifulSoup(response.text, "html.parser")

    def first_property(name):
        # first element in the document carrying the given RDFa property
        return doc.find_all(property=name)[0]

    essay = Essay(id=essay_id)
    essay.title = doc.title.text.strip()
    essay.created = first_property("dcterms:created")["content"]
    essay.modified = first_property("dcterms:modified")["content"]
    essay.creator = _lookup_awardee(first_property("dcterms:creator")["content"])
    # serialize the description element's children back to an HTML string
    essay.html = "".join(str(node) for node in first_property("dcterms:description").contents)
    essay.essay_editor_url = essay_url
    essay.save()  # so we can assign titles

    # attach any titles that the essay is about
    for subject in doc.find_all(property="dcterms:subject"):
        lccn = _lccn_from_title_uri(subject["content"])
        # load titles from web if not available
        try:
            title = Title.objects.get(lccn=lccn)
        except Title.DoesNotExist:
            management.call_command(
                "load_titles",
                "https://chroniclingamerica.loc.gov/lccn/%s/marc.xml" % lccn)
            title = Title.objects.get(lccn=lccn)

        # attach the title to the essay
        essay.titles.add(title)

        # index the title in solr if necessary
        if index:
            index_title(title)

    LOGGER.info("loaded essay: %s", essay_url)
    return essay
def load_essay(essay_url, index=True):
    """
    Load an essay from an RDFa HTML document.

    Fetches essay_url, scrapes the Dublin Core RDFa properties out of the
    HTML, saves an Essay record, attaches the Title records the essay is
    about (loading them remotely when absent), and returns the saved Essay.
    """
    LOGGER.info("loading essay %s", essay_url)

    # the essay id is the third path segment of the URL
    url_parts = urlparse.urlparse(essay_url)
    essay_id = url_parts[2].split("/")[2]

    r = requests.get(essay_url)
    # fail loudly on HTTP errors rather than scraping an error page
    r.raise_for_status()
    doc = BeautifulSoup(r.text, 'html.parser')

    essay = Essay(id=essay_id)
    essay.title = doc.title.text.strip()
    essay.created = doc.find_all(property="dcterms:created")[0]['content']
    essay.modified = doc.find_all(property="dcterms:modified")[0]['content']
    essay.creator = _lookup_awardee(
        doc.find_all(property="dcterms:creator")[0]['content'])
    description = doc.find_all(property="dcterms:description")[0]
    # serialize the description element's children back to an HTML string
    essay.html = ''.join(map(str, description.contents))
    essay.essay_editor_url = essay_url
    essay.save()  # so we can assign titles

    # attach any titles that the essay is about
    for title_uri in doc.find_all(property="dcterms:subject"):
        lccn = _lccn_from_title_uri(title_uri['content'])
        # load titles from web if not available; only the expected
        # "missing title" case triggers the remote load (was a broad
        # `except Exception` flagged by its own FIXME) — other errors
        # now propagate
        try:
            title = Title.objects.get(lccn=lccn)
        except Title.DoesNotExist:
            management.call_command(
                'load_titles',
                'https://chroniclingamerica.loc.gov/lccn/%s/marc.xml' % lccn)
            title = Title.objects.get(lccn=lccn)

        # attach the title to the essay
        essay.titles.add(title)

        # index the title in solr if necessary
        if index:
            index_title(title)

    LOGGER.info("loaded essay: %s", essay_url)
    return essay
def purge_essay(essay_url, index=True):
    """
    Purge an essay from the database.

    Deletes the Essay that was loaded from essay_url and, when index is
    True, reindexes the titles it was attached to. Raises Exception when
    no essay was loaded from essay_url.
    """
    # keep the try body minimal: only the lookup can legitimately raise
    # Essay.DoesNotExist; the old version wrapped delete/reindex too, so a
    # DoesNotExist raised there would be misreported as a missing essay
    try:
        essay = Essay.objects.get(essay_editor_url=essay_url)
    except Essay.DoesNotExist:
        raise Exception("No such essay loaded from %s" % essay_url)

    # snapshot the related titles before the delete severs the m2m rows
    titles = list(essay.titles.all())
    essay.delete()
    LOGGER.info("deleted essay %s", essay_url)

    # reindex titles so search no longer reflects the purged essay
    if index:
        for title in titles:
            index_title(title)
def purge_essay(essay_url, index=True):
    """
    Purge an essay from the database.

    Deletes the Essay that was loaded from essay_url and, when index is
    True, reindexes the titles it was attached to. Raises Exception when
    no essay was loaded from essay_url.
    """
    # keep the try body minimal: only the lookup can legitimately raise
    # Essay.DoesNotExist; wrapping delete/reindex too would misreport a
    # DoesNotExist raised there as a missing essay
    try:
        essay = Essay.objects.get(essay_editor_url=essay_url)
    except Essay.DoesNotExist:
        raise Exception("No such essay loaded from %s" % essay_url)

    # snapshot the related titles before the delete severs the m2m rows
    titles = list(essay.titles.all())
    essay.delete()
    # module logger with lazy %-args, consistent with the rest of the
    # module, instead of the root logger with eager string formatting
    LOGGER.info("deleted essay %s", essay_url)

    # reindex titles so search no longer reflects the purged essay
    if index:
        for title in titles:
            index_title(title)
def load_essay(essay_url, index=True):
    """
    Load an essay from an RDFa HTML document.

    Fetches essay_url, scrapes the Dublin Core RDFa properties out of the
    HTML, saves an Essay record, attaches the Title records the essay is
    about (loading them remotely when absent), and returns the saved Essay.
    """
    # lazy %-args instead of eager "..." % formatting in log calls
    LOGGER.info("loading essay %s", essay_url)

    # the essay id is the third path segment of the URL
    url_parts = urlparse.urlparse(essay_url)
    essay_id = url_parts[2].split("/")[2]

    r = requests.get(essay_url)
    # fail loudly on HTTP errors rather than scraping an error page
    r.raise_for_status()
    doc = BeautifulSoup(r.text, 'html.parser')

    essay = Essay(id=essay_id)
    essay.title = doc.title.text.strip()
    essay.created = doc.find_all(property="dcterms:created")[0]['content']
    essay.modified = doc.find_all(property="dcterms:modified")[0]['content']
    essay.creator = _lookup_awardee(
        doc.find_all(property="dcterms:creator")[0]['content'])
    description = doc.find_all(property="dcterms:description")[0]
    # serialize the description element's children back to an HTML string
    essay.html = ''.join(map(str, description.contents))
    essay.essay_editor_url = essay_url
    essay.save()  # so we can assign titles

    # attach any titles that the essay is about
    for title_uri in doc.find_all(property="dcterms:subject"):
        lccn = _lccn_from_title_uri(title_uri['content'])
        # load titles from web if not available; only the expected
        # "missing title" case triggers the remote load (was a broad
        # `except Exception` flagged by its own FIXME) — other errors
        # now propagate
        try:
            title = Title.objects.get(lccn=lccn)
        except Title.DoesNotExist:
            management.call_command(
                'load_titles',
                'https://chroniclingamerica.loc.gov/lccn/%s/marc.xml' % lccn)
            title = Title.objects.get(lccn=lccn)

        # attach the title to the essay
        essay.titles.add(title)

        # index the title in solr if necessary
        if index:
            index_title(title)

    LOGGER.info("loaded essay: %s", essay_url)
    return essay
def load_essay(essay_url, index=True):
    """
    Load an essay from an RDFa HTML document.

    Parses essay_url as RDFa, saves an Essay record built from its Dublin
    Core properties, attaches the Title records the essay is about
    (loading them remotely when absent), and returns the saved Essay.
    """
    # lazy %-args instead of eager "..." % formatting in log calls
    logging.info("loading essay %s", essay_url)
    g = Graph()
    g.parse(essay_url, format='rdfa', html5=True, encoding='utf-8')

    # create the essay instance
    essay_uri = URIRef(essay_url)
    essay = Essay(id=_essay_id(essay_uri))
    essay.title = unicode(g.value(essay_uri, DC.title)).strip()
    essay.created = g.value(essay_uri, DC.created).toPython()
    essay.modified = g.value(essay_uri, DC.modified).toPython()
    essay.creator = _lookup_awardee(g.value(essay_uri, DC.creator))
    essay.html = unicode(g.value(essay_uri, DC.description))
    essay.essay_editor_url = essay_url
    essay.save()  # so we can assign titles

    # attach any titles that the essay is about
    for title_uri in g.objects(essay_uri, DC.subject):
        lccn = _lccn_from_title_uri(title_uri)
        # load titles from web if not available; only the expected
        # "missing title" case triggers the remote load — the old
        # py2-only `except Exception, e` swallowed every error and
        # never used `e`
        try:
            title = Title.objects.get(lccn=lccn)
        except Title.DoesNotExist:
            management.call_command(
                'load_titles',
                'http://chroniclingamerica.loc.gov/lccn/%s/marc.xml' % lccn)
            title = Title.objects.get(lccn=lccn)

        # attach the title to the essay
        essay.titles.add(title)

        # index the title in solr if necessary
        if index:
            index_title(title)

    # return the saved essay, consistent with the other loaders
    return essay
def load_essay(essay_url, index=True):
    """
    Load an essay from an RDFa HTML document.

    Parses essay_url as RDFa, saves an Essay record built from its Dublin
    Core properties, attaches the Title records the essay is about
    (loading them remotely when absent), and returns the saved Essay.
    """
    # lazy %-args instead of eager "..." % formatting in log calls
    logging.info("loading essay %s", essay_url)
    g = Graph()
    g.parse(essay_url, format='rdfa')

    # create the essay instance
    essay_uri = URIRef(essay_url)
    essay = Essay(id=_essay_id(essay_uri))
    essay.title = unicode(g.value(essay_uri, DC.title)).strip()
    essay.created = g.value(essay_uri, DC.created).toPython()
    essay.modified = g.value(essay_uri, DC.modified).toPython()
    essay.creator = _lookup_awardee(g.value(essay_uri, DC.creator))
    essay.html = unicode(g.value(essay_uri, DC.description))
    essay.essay_editor_url = essay_url
    essay.save()  # so we can assign titles

    # attach any titles that the essay is about
    for title_uri in g.objects(essay_uri, DC.subject):
        lccn = _lccn_from_title_uri(title_uri)
        # load titles from web if not available; only the expected
        # "missing title" case triggers the remote load — the old
        # py2-only `except Exception, e` swallowed every error and
        # never used `e`
        try:
            title = Title.objects.get(lccn=lccn)
        except Title.DoesNotExist:
            management.call_command(
                'load_titles',
                'http://chroniclingamerica.loc.gov/lccn/%s/marc.xml' % lccn)
            title = Title.objects.get(lccn=lccn)

        # attach the title to the essay
        essay.titles.add(title)

        # index the title in solr if necessary
        if index:
            index_title(title)

    # return the saved essay, consistent with the other loaders
    return essay