Beispiel #1
0
def load_essay(essay_url, index=True, session=None):
    """
    Load an essay from an RDFa HTML document.

    The page at ``essay_url`` is downloaded and parsed, an ``Essay`` is
    populated from its ``dcterms:*`` properties and saved, and every title
    named by a ``dcterms:subject`` property is attached to it. Titles that
    are not in the database yet are loaded through the ``load_titles``
    management command first.

    :param essay_url: URL of the essay page. The element at index 2 of the
        URL path split on "/" is used as the essay id.
    :param index: when true, every attached title is passed to
        ``index_title``.
    :param session: optional ``requests.Session`` used for the download.
        When omitted, a temporary session is created and closed again
        before returning.
    :returns: the saved ``Essay`` instance.
    :raises requests.HTTPError: if the server answers with an error status.
    """
    # extract metadata from the html
    LOGGER.info("loading essay %s", essay_url)

    # create the essay instance
    url_parts = urlparse.urlparse(essay_url)
    essay_id = url_parts[2].split("/")[2]

    # Only a session created here is closed here; a caller-supplied session
    # is left open so the caller can keep reusing it.
    owns_session = session is None
    if owns_session:
        session = requests.Session()

    try:
        response = session.get(essay_url)
        response.raise_for_status()
        doc = BeautifulSoup(response.text, "html.parser")
    finally:
        if owns_session:
            session.close()

    essay = Essay(id=essay_id)
    essay.title = doc.title.text.strip()
    essay.created = doc.find_all(property="dcterms:created")[0]["content"]
    essay.modified = doc.find_all(property="dcterms:modified")[0]["content"]
    essay.creator = _lookup_awardee(
        doc.find_all(property="dcterms:creator")[0]["content"])
    # The essay body is the inner markup of the description element.
    description = doc.find_all(property="dcterms:description")[0]
    essay.html = "".join(map(str, description.contents))
    essay.essay_editor_url = essay_url
    essay.save()  # so we can assign titles

    # attach any titles that the essay is about
    for title_uri in doc.find_all(property="dcterms:subject"):
        lccn = _lccn_from_title_uri(title_uri["content"])

        # load titles from web if not available
        try:
            title = Title.objects.get(lccn=lccn)
        except Title.DoesNotExist:
            management.call_command(
                "load_titles",
                "https://chroniclingamerica.loc.gov/lccn/%s/marc.xml" % lccn)
            title = Title.objects.get(lccn=lccn)

        # attach the title to the essay
        essay.titles.add(title)

        # index the title in solr if necessary
        if index:
            index_title(title)

    LOGGER.info("loaded essay: %s", essay_url)
    return essay
Beispiel #2
0
def load_essay(essay_url, index=True):
    """
    Load an essay from an RDFa HTML document.

    The page at ``essay_url`` is downloaded and parsed, an ``Essay`` is
    populated from its ``dcterms:*`` properties and saved, and every title
    named by a ``dcterms:subject`` property is attached to it. Titles that
    are not in the database yet are loaded through the ``load_titles``
    management command first.

    :param essay_url: URL of the essay page. The element at index 2 of the
        URL path split on "/" is used as the essay id.
    :param index: when true, every attached title is passed to
        ``index_title``.
    :returns: the saved ``Essay`` instance.
    :raises requests.HTTPError: if the server answers with an error status.
    """
    # extract metadata from the html
    LOGGER.info("loading essay %s", essay_url)

    # create the essay instance
    url_parts = urlparse.urlparse(essay_url)
    essay_id = url_parts[2].split("/")[2]

    response = requests.get(essay_url)
    # Fail loudly on 4xx/5xx instead of parsing an error page as an essay.
    response.raise_for_status()
    doc = BeautifulSoup(response.text, 'html.parser')

    essay = Essay(id=essay_id)
    essay.title = doc.title.text.strip()
    essay.created = doc.find_all(property="dcterms:created")[0]['content']
    essay.modified = doc.find_all(property="dcterms:modified")[0]['content']
    essay.creator = _lookup_awardee(
        doc.find_all(property="dcterms:creator")[0]['content'])
    # The essay body is the inner markup of the description element.
    description = doc.find_all(property="dcterms:description")[0]
    essay.html = ''.join(map(str, description.contents))
    essay.essay_editor_url = essay_url
    essay.save()  # so we can assign titles

    # attach any titles that the essay is about
    for title_uri in doc.find_all(property="dcterms:subject"):
        lccn = _lccn_from_title_uri(title_uri['content'])

        # load titles from web if not available; only a missing row should
        # trigger the download, any other error must propagate
        try:
            title = Title.objects.get(lccn=lccn)
        except Title.DoesNotExist:
            management.call_command(
                'load_titles',
                'http://chroniclingamerica.loc.gov/lccn/%s/marc.xml' % lccn)
            title = Title.objects.get(lccn=lccn)

        # attach the title to the essay
        essay.titles.add(title)

        # index the title in solr if necessary
        if index:
            index_title(title)

    LOGGER.info("loaded essay: %s", essay_url)
    return essay
Beispiel #3
0
def purge_essay(essay_url, index=True):
    """
    Purge an essay from the database.

    :param essay_url: value of ``essay_editor_url`` identifying the essay.
    :param index: when true, every title that was attached to the essay is
        passed to ``index_title`` again after the essay has been deleted.
    :raises Exception: if no essay with that ``essay_editor_url`` exists.
    """
    # Guard only the lookup, so that a failure while deleting or reindexing
    # is never misreported as "no such essay".
    try:
        essay = Essay.objects.get(essay_editor_url=essay_url)
    except Essay.DoesNotExist:
        raise Exception("No such essay loaded from %s" % essay_url)

    # Snapshot the related titles first: the relation is gone once the
    # essay has been deleted.
    titles = list(essay.titles.all())
    essay.delete()
    LOGGER.info("deleted essay %s", essay_url)

    # reindex titles
    if index:
        for title in titles:
            index_title(title)
Beispiel #4
0
def purge_essay(essay_url, index=True):
    """
    Purge an essay from the database.

    :param essay_url: value of ``essay_editor_url`` identifying the essay.
    :param index: when true, every title that was attached to the essay is
        passed to ``index_title`` again after the essay has been deleted.
    :raises Exception: if no essay with that ``essay_editor_url`` exists.
    """
    # Guard only the lookup, so that a failure while deleting or reindexing
    # is never misreported as "no such essay".
    try:
        essay = Essay.objects.get(essay_editor_url=essay_url)
    except Essay.DoesNotExist:
        raise Exception("No such essay loaded from %s" % essay_url)

    # Snapshot the related titles first: the relation is gone once the
    # essay has been deleted.
    titles = list(essay.titles.all())
    essay.delete()
    # Pass the URL as a logging argument so it is only formatted when the
    # record is actually emitted.
    logging.info("deleted essay %s", essay_url)

    # reindex titles
    if index:
        for title in titles:
            index_title(title)
def load_essay(essay_url, index=True):
    """
    Load an essay from an RDFa HTML document.

    The page at ``essay_url`` is downloaded and parsed, an ``Essay`` is
    populated from its ``dcterms:*`` properties and saved, and every title
    named by a ``dcterms:subject`` property is attached to it. Titles that
    are not in the database yet are loaded through the ``load_titles``
    management command first.

    :param essay_url: URL of the essay page. The element at index 2 of the
        URL path split on "/" is used as the essay id.
    :param index: when true, every attached title is passed to
        ``index_title``.
    :returns: the saved ``Essay`` instance.
    :raises requests.HTTPError: if the server answers with an error status.
    """
    # extract metadata from the html
    LOGGER.info("loading essay %s", essay_url)

    # create the essay instance
    url_parts = urlparse.urlparse(essay_url)
    essay_id = url_parts[2].split("/")[2]

    response = requests.get(essay_url)
    # Fail loudly on 4xx/5xx instead of parsing an error page as an essay.
    response.raise_for_status()
    doc = BeautifulSoup(response.text, 'html.parser')

    essay = Essay(id=essay_id)
    essay.title = doc.title.text.strip()
    essay.created = doc.find_all(property="dcterms:created")[0]['content']
    essay.modified = doc.find_all(property="dcterms:modified")[0]['content']
    essay.creator = _lookup_awardee(
        doc.find_all(property="dcterms:creator")[0]['content'])
    # The essay body is the inner markup of the description element.
    description = doc.find_all(property="dcterms:description")[0]
    essay.html = ''.join(map(str, description.contents))
    essay.essay_editor_url = essay_url
    essay.save()  # so we can assign titles

    # attach any titles that the essay is about
    for title_uri in doc.find_all(property="dcterms:subject"):
        lccn = _lccn_from_title_uri(title_uri['content'])

        # load titles from web if not available; only a missing row should
        # trigger the download, any other error must propagate
        try:
            title = Title.objects.get(lccn=lccn)
        except Title.DoesNotExist:
            management.call_command(
                'load_titles',
                'http://chroniclingamerica.loc.gov/lccn/%s/marc.xml' % lccn)
            title = Title.objects.get(lccn=lccn)

        # attach the title to the essay
        essay.titles.add(title)

        # index the title in solr if necessary
        if index:
            index_title(title)

    LOGGER.info("loaded essay: %s", essay_url)
    return essay
def load_essay(essay_url, index=True):
    """
    Load an essay from an RDFa HTML document.

    The document at ``essay_url`` is parsed into an RDF graph, an ``Essay``
    is populated from the Dublin Core values attached to the essay URI and
    saved, and every ``DC.subject`` title is attached to it. Titles that
    are not in the database yet are loaded through the ``load_titles``
    management command first.

    :param essay_url: URL of the essay page; also used as the subject URI
        when querying the graph.
    :param index: when true, every attached title is passed to
        ``index_title``.

    The function has no explicit return value.

    NOTE: this variant relies on the Python 2 ``unicode`` builtin.
    """
    # extract metadata from the html
    logging.info("loading essay %s", essay_url)
    g = Graph()
    g.parse(essay_url, format='rdfa', html5=True, encoding='utf-8')

    # create the essay instance
    essay_uri = URIRef(essay_url)
    essay_id = _essay_id(essay_uri)
    # Read the modification date once and reuse it below.
    modified = g.value(essay_uri, DC.modified).toPython()

    essay = Essay(id=essay_id)
    essay.title = unicode(g.value(essay_uri, DC.title)).strip()
    essay.created = g.value(essay_uri, DC.created).toPython()
    essay.modified = modified
    essay.creator = _lookup_awardee(g.value(essay_uri, DC.creator))
    essay.html = unicode(g.value(essay_uri, DC.description))
    essay.essay_editor_url = essay_url
    essay.save()  # so we can assign titles

    # attach any titles that the essay is about
    for title_uri in g.objects(essay_uri, DC.subject):
        lccn = _lccn_from_title_uri(title_uri)

        # load titles from web if not available; only a missing row should
        # trigger the download, any other error must propagate
        try:
            title = Title.objects.get(lccn=lccn)
        except Title.DoesNotExist:
            management.call_command(
                'load_titles',
                'http://chroniclingamerica.loc.gov/lccn/%s/marc.xml' % lccn)
            title = Title.objects.get(lccn=lccn)

        # attach the title to the essay
        essay.titles.add(title)

        # index the title in solr if necessary
        if index:
            index_title(title)
Beispiel #7
0
def load_essay(essay_url, index=True):
    """
    Load an essay from an RDFa HTML document.

    The document at ``essay_url`` is parsed into an RDF graph, an ``Essay``
    is populated from the Dublin Core values attached to the essay URI and
    saved, and every ``DC.subject`` title is attached to it. Titles that
    are not in the database yet are loaded through the ``load_titles``
    management command first.

    :param essay_url: URL of the essay page; also used as the subject URI
        when querying the graph.
    :param index: when true, every attached title is passed to
        ``index_title``.

    NOTE: this variant relies on the Python 2 ``unicode`` builtin.
    """
    # extract metadata from the html
    logging.info("loading essay %s", essay_url)
    g = Graph()
    g.parse(essay_url, format='rdfa')

    # create the essay instance
    essay_uri = URIRef(essay_url)
    essay_id = _essay_id(essay_uri)
    # Read the modification date once and reuse it below.
    modified = g.value(essay_uri, DC.modified).toPython()

    essay = Essay(id=essay_id)
    essay.title = unicode(g.value(essay_uri, DC.title)).strip()
    essay.created = g.value(essay_uri, DC.created).toPython()
    essay.modified = modified
    essay.creator = _lookup_awardee(g.value(essay_uri, DC.creator))
    essay.html = unicode(g.value(essay_uri, DC.description))
    essay.essay_editor_url = essay_url
    essay.save()  # so we can assign titles

    # attach any titles that the essay is about
    for title_uri in g.objects(essay_uri, DC.subject):
        lccn = _lccn_from_title_uri(title_uri)

        # load titles from web if not available; only a missing row should
        # trigger the download, any other error must propagate
        try:
            title = Title.objects.get(lccn=lccn)
        except Title.DoesNotExist:
            management.call_command(
                'load_titles',
                'http://chroniclingamerica.loc.gov/lccn/%s/marc.xml' % lccn)
            title = Title.objects.get(lccn=lccn)

        # attach the title to the essay
        essay.titles.add(title)

        # index the title in solr if necessary
        if index:
            index_title(title)