Esempio n. 1
0
    def import_student_researchpage(self, studentpage, element):
        """Import one research page of a student as a ``ResearchItem``.

        ``element`` is the XML element of the page; its ``contentid``
        attribute identifies the ``ResearchItem`` to update (a new one is
        built when none exists yet).  School and programme are copied from
        ``studentpage``.  Nothing is written to the database unless
        ``self.save`` is true.

        Returns a dict with the keys ``"title"`` and ``"texts"`` holding the
        error values reported by ``text_from_elem`` and ``self.import_texts``.
        """
        content_id = element.attrib["contentid"]
        title, title_error = text_from_elem(
            element, "title", length=255, textify=True)
        texts, texts_error = self.import_texts(element.find("texts"))
        errors = {"title": title_error, "texts": texts_error}

        # Reuse the research item previously imported for this content id
        try:
            item = ResearchItem.objects.get(rca_content_id=content_id)
        except ResearchItem.DoesNotExist:
            item = ResearchItem()
            item.rca_content_id = content_id

        field_values = (
            ("title", title),
            ("research_type", "student"),
            ("description", texts),
            ("school", studentpage.school),
            ("programme", studentpage.programme),
        )
        for field_name, field_value in field_values:
            setattr(item, field_name, field_value)
        item.slug = make_slug(item)

        if self.save:
            # Items without an id are new and get attached under the index
            if not item.id:
                self.research_index_page.add_child(item)
            else:
                item.save()

            # Record the student as creator of the item
            ResearchItemCreator.objects.get_or_create(page=item,
                                                      person=studentpage)

        # Carousel images
        images_element = element.find("images")
        if images_element is None:
            image_elements = []
        else:
            image_elements = images_element.findall("image")

        for image_element in image_elements:
            imported, _image_error = self.import_image(image_element)
            if imported is None or not self.save:
                continue
            ResearchItemCarouselItem.objects.get_or_create(page=item,
                                                           image=imported)

        return errors
Esempio n. 2
0
    def import_student_researchpage(self, studentpage, element):
        """Create or update the ``ResearchItem`` described by ``element``.

        The item is looked up by the element's ``contentid`` attribute, filled
        from the element's title and texts plus the school and programme of
        ``studentpage``, and linked to ``studentpage`` as its creator.
        Database writes only happen when ``self.save`` is true.

        Returns the dict of error values collected for ``"title"`` and
        ``"texts"``.
        """
        errors = {}

        # Page info
        contentid = element.attrib["contentid"]
        title, errors["title"] = text_from_elem(
            element, "title", length=255, textify=True
        )
        body, errors["texts"] = self.import_texts(element.find("texts"))

        # Existing item for this content id, or a fresh one
        try:
            page = ResearchItem.objects.get(rca_content_id=contentid)
        except ResearchItem.DoesNotExist:
            page = ResearchItem()
            page.rca_content_id = contentid

        page.title = title
        page.research_type = "student"
        page.description = body
        page.school = studentpage.school
        page.programme = studentpage.programme
        page.slug = make_slug(page)

        if self.save:
            if page.id:
                page.save()
            else:
                # New pages are added below the research index page
                self.research_index_page.add_child(page)

            # Link to creator
            ResearchItemCreator.objects.get_or_create(
                page=page, person=studentpage
            )

        # Carousel images: nothing more to do when the page has none
        images_root = element.find("images")
        if images_root is None:
            return errors

        for node in images_root.findall("image"):
            picture, _ = self.import_image(node)
            if picture is not None and self.save:
                ResearchItemCarouselItem.objects.get_or_create(
                    page=page, image=picture
                )

        return errors
Esempio n. 3
0
    def import_researchitem(self, researchitem):
        """Create or update the staff ``ResearchItem`` page for one record.

        ``researchitem`` is a dict-like record.  The keys ``"eprintid"``,
        ``"title"`` and ``"type"`` are required; ``"abstract"``,
        ``"divisions"``, ``"date"``, ``"datestamp"``, ``"book_title"``,
        ``"event_title"`` and ``"creators"`` are optional.

        The page is looked up by ``eprintid``.  When it already exists, its
        latest revision receives the same field values so that the revision
        does not lag behind the page.  Nothing is saved unless ``self.save``
        is true, and creators are only linked when ``self.link_creators`` is
        true.

        Raises:
            KeyError: if a required key is missing or ``"type"`` is not a key
                of ``WORK_TYPES_CHOICES``.
        """
        # Required info
        eprintid = researchitem["eprintid"]
        title = researchitem["title"]
        item_type = researchitem["type"]
        work_type = WORK_TYPES_CHOICES[item_type]

        # Year: the first 4 characters of the date are always the year
        for date_key in ("date", "datestamp"):
            if date_key in researchitem:
                year = str(researchitem[date_key])[:4]
                break
        else:
            # Single-argument call form works on both Python 2 and 3
            print("NO DATE")
            year = ""

        # School: taken from the first division that has a mapping
        school = ""
        for division in researchitem.get("divisions", []):
            if division in DIVISION_SCHOOL_MAPPING:
                school = DIVISION_SCHOOL_MAPPING[division]
                break

        # Convert description to HTML
        abstract = text_to_html(researchitem.get("abstract", ""))

        # Subtitle: only some types carry one, and the record may still lack
        # the field, so fall back to an empty subtitle instead of failing
        subtitle_keys = {
            'book_section': 'book_title',
            'conference_item': 'event_title',
        }
        subtitle_key = subtitle_keys.get(item_type)
        if subtitle_key is None:
            subtitle = ''
        else:
            subtitle = researchitem.get(subtitle_key, '')

        # Find or create the page
        try:
            page = ResearchItem.objects.get(eprintid=eprintid)

            # Find latest revision of the page
            latest_revision = page.get_latest_revision_as_page()
        except ResearchItem.DoesNotExist:
            page = ResearchItem(eprintid=eprintid)
            latest_revision = None

        # Values shared by the page and its latest revision
        field_values = (
            ("title", title),
            ("subtitle", subtitle),
            ("ref", True),
            ("research_type", "staff"),
            ("year", year),
            ("description", abstract),
            ("work_type", work_type),
            ("school", school),
            ("show_on_homepage", False),
        )

        for field_name, field_value in field_values:
            setattr(page, field_name, field_value)
        page.slug = make_slug(page)

        # Save the page; pages without an id are new children of the index
        if self.save:
            if page.id:
                page.save()
            else:
                self.research_index_page.add_child(page)

        # Update latest revision
        if latest_revision is not None:
            for field_name, field_value in field_values:
                setattr(latest_revision, field_name, field_value)

            if self.save:
                latest_revision.save_revision()

        # Link creators
        if self.link_creators:
            for creator in researchitem.get("creators", []):
                creator_name = creator["name"]["given"] + " " + creator["name"]["family"]
                self.add_researchitemcreator(page, creator_name)
Esempio n. 4
0
def doimport(**kwargs):
    """Import news items, and their images, from an XML export.

    Keyword arguments:
        path: XML file to parse (defaults to ``PATH``).
        save: when true, pages, images and carousel items are written to the
            database (defaults to ``False``).
        image_path: prefix prepended to every image filename (defaults to
            ``IMAGE_PATH``).
        ruthless: when true, images embedded in an existing body and the
            existing carousel items (with their images) are deleted before
            the item is re-imported (defaults to ``False``).

    Landscape images become carousel items of the news item; other saved
    images are embedded at the top of the news item body.

    Returns:
        A tuple ``(errors, images_errors)``.  Each is a list of single-entry
        dicts mapping an XML element to a dict of its non-empty field errors.
    """
    path = kwargs.get('path', PATH)
    save = kwargs.get('save', False)
    image_path = kwargs.get('image_path', IMAGE_PATH)
    ruthless = kwargs.get('ruthless', False)
    newsindex = NEWS_INDEX
    root = ET.parse(path).getroot()
    errors = []
    images_errors = []
    for item in root.findall('news_item'):
        itemerrors = {}

        # sort out what instance this is
        news_contentid = item.attrib['contentid']
        title, itemerrors['title'] = text_from_elem(item, 'title', length=255)
        date = parse_date(item.find('goinglivedate').text.strip().replace('.', '-')) or datetime.date.today()
        try:
            newsitem = NewsItem.objects.get(rca_content_id=news_contentid)
        except NewsItem.DoesNotExist:
            newsitem = NewsItem(rca_content_id=news_contentid)
        newsitem.title = title
        newsitem.date = date
        newsitem.intro = richtext_from_elem(item.find('intro'))
        newsitem.slug = make_slug(newsitem)

        # possibly delete any images that are embedded in the existing body
        # NOTE(review): this deletion runs even when ``save`` is false.
        if ruthless:
            soup = BeautifulSoup(newsitem.body, 'html.parser')
            to_delete_ids = []
            for embed in soup.find_all('embed'):
                try:
                    to_delete_ids.append(int(embed.attrs['id']))
                except (KeyError, ValueError):
                    # An embed without a numeric id gives nothing to delete
                    pass
            if to_delete_ids:
                RcaImage.objects.filter(id__in=to_delete_ids).delete()

        # build the body
        strings = []
        texts_element = item.find('texts')
        # Compare with None: the truth value of an Element depends on whether
        # it has children, and testing it is deprecated.
        if texts_element is not None:
            for elem in texts_element.findall('text'):
                strings.append(richtext_from_elem(elem.find('content')))
        newsitem.body = '\n'.join(strings)

        # save newsitem
        if save:
            if newsitem.id:
                newsitem.save()
            else:
                newsindex.add_child(newsitem)

        tobesaved = False
        images_element = item.find('images')
        if images_element is not None:
            # first drop the existing carousel items and their images
            if ruthless:
                for c in NewsItemCarouselItem.objects.filter(page=newsitem):
                    c.image.delete()
                    c.delete()

            for image in images_element.findall('image'):
                imageerrors = {}
                metadata = image.find('imagemetadata')
                im_contentid = image.attrib['contentid']
                filename = urllib2.unquote(image.find('filename').text.strip())
                try:
                    theimage = RcaImage.objects.get(rca_content_id=im_contentid)
                except RcaImage.DoesNotExist:
                    theimage = RcaImage(rca_content_id=im_contentid)

                theimage.title, imageerrors['title'] = text_from_elem(metadata, 'title', length=255, textify=True)
                theimage.creator, imageerrors['creator'] = text_from_elem(metadata, 'creator', length=255, textify=True)
                theimage.medium, imageerrors['medium'] = text_from_elem(metadata, 'media', length=255, textify=True)
                theimage.photographer, imageerrors['photog'] = text_from_elem(metadata, 'photographer', length=255, textify=True)
                theimage.permission, imageerrors['perms'] = text_from_elem(metadata, 'rights', length=255, textify=True)

                caption, imageerrors['caption'] = text_from_elem(metadata, 'caption', length=255, textify=True)
                theimage.alt = caption

                try:
                    # Images are binary data, so the file must be opened in
                    # binary mode.
                    with File(open(image_path + filename.encode('utf-8'), 'rb')) as f:
                        if theimage.id and save:
                            theimage.delete()
                        theimage.file = f
                        if save:
                            theimage.save()
                except IOError as e:
                    print("I/O error({0}): {1}".format(e.errno, e.strerror))
                    print(repr(filename))
                except ValueError:
                    print("Could not convert data to an integer.")
                except:
                    # Report what went wrong, then let the error propagate
                    import sys
                    print("Unexpected error: %s" % (sys.exc_info()[0],))
                    raise

                if save and theimage.is_landscape():
                    NewsItemCarouselItem.objects.get_or_create(
                        page=newsitem,
                        image=theimage,
                    )
                elif save and theimage.id:
                    imagestring = '<embed alt="%(alt)s" embedtype="image" format="right" id="%(id)s"/>' % {
                        'alt': theimage.alt,
                        'id': theimage.id,
                    }
                    newsitem.body = imagestring + newsitem.body
                    tobesaved = True

                imageerrordict = {k: v for k, v in imageerrors.items() if v}
                if imageerrordict:
                    images_errors.append({image: imageerrordict})

        # The body changed after the first save when images were embedded
        if tobesaved and save:
            newsitem.save()

        errordict = {k: v for k, v in itemerrors.items() if v}
        if errordict:
            errors.append({item: errordict})
    return errors, images_errors
def import_staff(element):
    """Import one staff member from its XML ``element``.

    Creates or updates the ``StaffPage`` identified by the element's
    ``contentid`` attribute, its ``StaffPageRole``, its carousel images and
    its research pages, then appends the staff member's URLs to the page's
    ``practice`` field and saves the page again.

    Returns the dict of error values reported by ``text_from_elem`` for the
    basic fields, keyed by field name.
    """
    errors = {}

    # Basic info
    staff_contentid = element.attrib['contentid']
    staff_title, errors['title'] = text_from_elem(element, 'title', length=255, textify=True)
    staff_name, errors['name'] = text_from_elem(element, 'staffname', length=255, textify=True)
    staff_programme, errors['programme'] = text_from_elem(element, 'programme', length=255, textify=True)
    staff_statement, errors['statement'] = text_from_elem(element, 'statement')
    staff_biography, errors['biography'] = text_from_elem(element, 'biography')
    staff_school, errors['school'] = text_from_elem(element, 'school', length=255, textify=True)
    # The value is not used, but its error is still reported to the caller
    _, errors['editorialreference'] = text_from_elem(element, 'editorialreference', length=255, textify=True)

    # Emails (elements without text are skipped)
    emails_element = element.find('emails')
    if emails_element is not None:
        staff_emails = [email.text for email in emails_element.findall('email') if email.text]
    else:
        staff_emails = []

    # URLs (elements without text are skipped)
    urls_element = element.find('urls')
    if urls_element is not None:
        staff_urls = [url.text for url in urls_element.findall('url') if url.text]
    else:
        staff_urls = []

    # Supervised students
    staff_supervisedstudents = []
    supervisedstudents_element = element.find('supervisedstudents')
    if supervisedstudents_element is not None:
        staff_supervisedstudents = [
            supervisedstudent.text
            for supervisedstudent in supervisedstudents_element.findall('supervisedstudent')
            if supervisedstudent.text is not None
        ]

    # Cleanup statement and biography
    staff_statement = cleanup_html(staff_statement)
    staff_biography = cleanup_html(staff_biography)

    # Split name into first name, last name and title
    name_split = staff_name.split()
    if staff_name == "A L Rees":
        # A L Rees needs to be split up manually
        staff_titleprefix = ""
        staff_firstname = "A L"
        staff_lastname = "Rees"
    elif name_split and name_split[0] in ("Professor", "Dr", "Sir"):
        staff_titleprefix = name_split[0]
        # Slices rather than indexes: a name made of a title only is valid
        staff_firstname = " ".join(name_split[1:2])
        staff_lastname = " ".join(name_split[2:])
    else:
        staff_titleprefix = ""
        staff_firstname = " ".join(name_split[:1])
        staff_lastname = " ".join(name_split[1:])

    # Remove "Programme" from staff_programme if it is there.  This applies
    # to every staff member and tolerates an empty programme.
    staff_programme_split = staff_programme.split()
    if staff_programme_split and staff_programme_split[-1] in ("Programme", "Programmes"):
        staff_programme = " ".join(staff_programme_split[:-1])

    # Slugs: the school slug is looked up by school, not by programme
    staff_programme_slug = constants.PROGRAMMES.get(staff_programme, "")
    staff_school_slug = constants.SCHOOLS.get(staff_school, "")

    # Create page for staff member
    try:
        staffpage = StaffPage.objects.get(rca_content_id=staff_contentid)
    except StaffPage.DoesNotExist:
        staffpage = StaffPage()
        staffpage.rca_content_id = staff_contentid
    staffpage.title = staff_name
    staffpage.school = staff_school_slug
    staffpage.staff_type = "academic"
    staffpage.intro = staff_statement
    staffpage.biography = staff_biography
    staffpage.show_on_homepage = False
    staffpage.show_on_programme_page = False
    staffpage.title_prefix = staff_titleprefix
    staffpage.first_name = staff_firstname
    staffpage.last_name = staff_lastname
    if staff_supervisedstudents:
        staffpage.supervised_student_other = ", ".join(staff_supervisedstudents)
    staffpage.slug = make_slug(staffpage)
    if staffpage.id:
        staffpage.save()
    else:
        STAFF_INDEX_PAGE.add_child(staffpage)

    # Create role
    try:
        staffpagerole = StaffPageRole.objects.get(page=staffpage)
    except StaffPageRole.DoesNotExist:
        staffpagerole = StaffPageRole()
        staffpagerole.page = staffpage

    staffpagerole.title = staff_title
    staffpagerole.school = staff_school_slug
    staffpagerole.programme = staff_programme_slug
    if staff_emails:
        staffpagerole.email = staff_emails[0]
    staffpagerole.save()

    # Images
    images_element = element.find('images')
    if images_element is not None:
        for image in images_element.findall('image'):
            # Import the image
            theimage, error = import_image(image)

            # Add to carousel
            if theimage is not None:
                StaffPageCarouselItem.objects.get_or_create(page=staffpage, image=theimage)

    # Research pages
    researchpages_element = element.find('researchpages')
    if researchpages_element is not None:
        for researchpage in researchpages_element.findall('page'):
            import_staff_researchpage(staffpage, researchpage)

        # Research child pages
        research_childpages_element = researchpages_element.find('childpages')
        if research_childpages_element is not None:
            for childpage in research_childpages_element.findall('page'):
                import_staff_researchpage(staffpage, childpage)

    # Append URLs to bottom of practice block; nothing is appended when the
    # staff member has no URLs.
    # NOTE(review): URLs are inserted into the HTML without escaping.
    if staff_urls:
        url_items = []
        for url in staff_urls:
            url_valid = url
            if "://" not in url_valid:
                url_valid = "http://" + url_valid
            url_items.append("<li><a href=\"" + url_valid + "\">" + url + "</a></li>")
        staffpage.practice = (staffpage.practice or "") + "<ul>" + "".join(url_items) + "</ul>"

    # Resave page
    staffpage.save()

    return errors
def import_staff_researchpage(staffpage, element):
    """Import one research page of a staff member.

    When the page title is a key of ``interesting_pages``, the page texts are
    stored on the ``staffpage`` attribute named there and the page images join
    the staff page carousel.  Any other page becomes a ``ResearchItem`` (found
    by the element's ``contentid`` attribute, or newly created) that takes its
    school and programme from the staff page role, is linked to ``staffpage``
    as creator and receives the images in its own carousel.

    Returns a dict with the error values for ``'title'`` and ``'texts'``.
    """
    content_id = element.attrib['contentid']
    title, title_error = text_from_elem(element, 'title', length=255, textify=True)
    texts, texts_error = import_texts(element.find('texts'))
    errors = {'title': title_error, 'texts': texts_error}

    if title in interesting_pages:
        # The texts belong on the staff page itself
        setattr(staffpage, interesting_pages[title], texts)
        carousel_model = StaffPageCarouselItem
        carousel_page = staffpage
    else:
        # School and programme come from the staff page role, if there is one
        try:
            role = StaffPageRole.objects.get(page=staffpage)
        except StaffPageRole.DoesNotExist:
            school = programme = ""
        else:
            school = role.school
            programme = role.programme

        # Find or create the research item
        try:
            item = ResearchItem.objects.get(rca_content_id=content_id)
        except ResearchItem.DoesNotExist:
            item = ResearchItem()
            item.rca_content_id = content_id

        field_values = (
            ('title', title),
            ('research_type', "staff"),
            ('description', texts),
            ('school', school),
            ('programme', programme),
        )
        for field_name, field_value in field_values:
            setattr(item, field_name, field_value)
        item.slug = make_slug(item)

        # Items without an id are new and go below the research index page
        if not item.id:
            RESEARCH_INDEX_PAGE.add_child(item)
        else:
            item.save()

        # Link to creator
        ResearchItemCreator.objects.get_or_create(page=item, person=staffpage)

        carousel_model = ResearchItemCarouselItem
        carousel_page = item

    # Carousel images go to whichever page was selected above
    images_element = element.find('images')
    if images_element is not None:
        for image_element in images_element.findall('image'):
            imported, _image_error = import_image(image_element)
            if imported is not None:
                carousel_model.objects.get_or_create(page=carousel_page, image=imported)

    return errors
Esempio n. 7
0
    def import_student(self, element):
        """Import one research student from its XML ``element``.

        Creates or updates the ``StudentPage`` identified by the element's
        ``contentid`` attribute, together with its contact emails, contact
        websites, carousel images and research pages.  Nothing is written to
        the database unless ``self.save`` is true.

        Returns:
            ``None`` when the student's name is in ``IGNORED_NAMES`` and the
            element is skipped; otherwise the dict of error values reported
            by ``text_from_elem`` for the basic fields, keyed by field name.
        """
        errors = {}

        # Basic info
        student_contentid = element.attrib["contentid"]
        student_title, errors["title"] = text_from_elem(element, "title", length=255, textify=True)
        student_name, errors["name"] = text_from_elem(element, "staffname", length=255, textify=True)
        student_programme, errors["programme"] = text_from_elem(element, "programme", length=255, textify=True)
        student_biography, errors["biography"] = text_from_elem(element, "biography")
        student_school, errors["school"] = text_from_elem(element, "school", length=255, textify=True)
        # The value is not used, but its error is still reported
        _, errors["editorialreference"] = text_from_elem(element, "editorialreference", length=255, textify=True)

        # If name is in ignore list, skip it
        if student_name in IGNORED_NAMES:
            return None

        # Emails (elements without text are skipped)
        emails_element = element.find("emails")
        if emails_element is not None:
            student_emails = [email.text for email in emails_element.findall("email") if email.text]
        else:
            student_emails = []

        # URLs (elements without text are skipped)
        urls_element = element.find("urls")
        if urls_element is not None:
            student_urls = [url.text for url in urls_element.findall("url") if url.text]
        else:
            student_urls = []

        # Supervisor: only looked up when a name is actually present
        student_supervisor = None
        supervisedstudents_element = element.find("supervisedstudents")
        if supervisedstudents_element is not None:
            supervisor_element = supervisedstudents_element.find("supervisedstudent")
            if supervisor_element is not None and supervisor_element.text:
                # Get page for supervisor
                student_supervisor = self.find_staff_page(supervisor_element.text)

        # Cleanup biography
        student_biography = cleanup_html(student_biography)

        # Split name into first name and last name
        name_split = student_name.split()
        student_firstname = " ".join(name_split[:1])
        student_lastname = " ".join(name_split[1:])

        # Remove "Programme" from student_programme if it is there
        student_programme_split = student_programme.split()
        if student_programme_split and student_programme_split[-1] in ("Programme", "Programmes"):
            student_programme = " ".join(student_programme_split[:-1])

        # Remove a leading backslash followed by "r" from student_programme.
        # The check matches those two literal characters, not a carriage
        # return.
        if student_programme and student_programme[:2] == "\\r":
            student_programme = student_programme[2:]

        # Likewise remove a literal backslash followed by "n" from the
        # beginning and end of student_school
        if student_school and student_school[:2] == "\\n":
            student_school = student_school[2:]
        if student_school and student_school[-2:] == "\\n":
            student_school = student_school[:-2]

        # If student is in STUDENT_PROGRAMMES list, then use the programme set there
        if student_name in STUDENT_PROGRAMMES:
            student_programme = STUDENT_PROGRAMMES[student_name]

        # Slugs
        student_programme_slug = constants.PROGRAMMES.get(student_programme, "")
        student_school_slug = constants.SCHOOLS.get(student_school, "")
        degree_subject_slug = constants.DEGREE_SUBJECTS.get(student_programme, "")

        # Create page for student
        try:
            studentpage = StudentPage.objects.get(rca_content_id=student_contentid)
        except StudentPage.DoesNotExist:
            studentpage = StudentPage()
            studentpage.rca_content_id = student_contentid
        studentpage.title = student_name
        studentpage.school = student_school_slug
        studentpage.programme = student_programme_slug
        studentpage.degree_qualification = "researchstudent"
        studentpage.degree_subject = degree_subject_slug
        studentpage.degree_year = ""
        studentpage.statement = student_biography
        studentpage.funding = student_title
        studentpage.show_on_homepage = False
        studentpage.show_on_programme_page = False
        studentpage.first_name = student_firstname
        studentpage.last_name = student_lastname
        studentpage.supervisor = student_supervisor
        studentpage.slug = make_slug(studentpage)
        if self.save:
            if studentpage.id:
                studentpage.save()
            else:
                self.student_index_page.add_child(studentpage)

        # Contacts are database rows pointing at the page, so they are only
        # written when saving, like every other write in this method.
        if self.save:
            # Emails
            for email in student_emails:
                StudentPageContactsEmail.objects.get_or_create(page=studentpage, email=email)

            # URLs
            for url in student_urls:
                StudentPageContactsWebsite.objects.get_or_create(page=studentpage, website=url)

        # Images
        images_element = element.find("images")
        if images_element is not None:
            for image in images_element.findall("image"):
                # Import the image
                theimage, error = self.import_image(image)

                if theimage is not None and self.save:
                    # Add to carousel
                    StudentPageCarouselItem.objects.get_or_create(page=studentpage, image=theimage)

        # Research pages
        researchpages_element = element.find("researchpages")
        if researchpages_element is not None:
            for researchpage in researchpages_element.findall("page"):
                self.import_student_researchpage(studentpage, researchpage)

            # Research child pages
            research_childpages_element = researchpages_element.find("childpages")
            if research_childpages_element is not None:
                for childpage in research_childpages_element.findall("page"):
                    self.import_student_researchpage(studentpage, childpage)

        # Resave page
        if self.save:
            studentpage.save()

        return errors
def import_staff(element):
    """Import a staff member described by the XML ``element``.

    The ``StaffPage`` with the element's ``contentid`` is created or updated,
    followed by its ``StaffPageRole``, carousel images and research pages.
    Finally the staff member's URLs are appended to the page's ``practice``
    field as an HTML list and the page is saved once more.

    Returns:
        A dict keyed by field name with the error values that
        ``text_from_elem`` reported for the basic fields.
    """
    errors = {}

    # Basic info
    staff_contentid = element.attrib['contentid']
    staff_title, errors['title'] = text_from_elem(element,
                                                  'title',
                                                  length=255,
                                                  textify=True)
    staff_name, errors['name'] = text_from_elem(element,
                                                'staffname',
                                                length=255,
                                                textify=True)
    staff_programme, errors['programme'] = text_from_elem(element,
                                                          'programme',
                                                          length=255,
                                                          textify=True)
    staff_statement, errors['statement'] = text_from_elem(element, 'statement')
    staff_biography, errors['biography'] = text_from_elem(element, 'biography')
    staff_school, errors['school'] = text_from_elem(element,
                                                    'school',
                                                    length=255,
                                                    textify=True)
    # Only the error is kept; the editorial reference itself is not used
    _, errors['editorialreference'] = text_from_elem(
        element, 'editorialreference', length=255, textify=True)

    # Emails, ignoring elements that carry no text
    staff_emails = []
    emails_element = element.find('emails')
    if emails_element is not None:
        staff_emails = [
            email.text for email in emails_element.findall('email')
            if email.text
        ]

    # URLs, ignoring elements that carry no text
    staff_urls = []
    urls_element = element.find('urls')
    if urls_element is not None:
        staff_urls = [
            url.text for url in urls_element.findall('url') if url.text
        ]

    # Supervised students
    staff_supervisedstudents = []
    supervisedstudents_element = element.find('supervisedstudents')
    if supervisedstudents_element is not None:
        for supervisedstudent in supervisedstudents_element.findall(
                'supervisedstudent'):
            if supervisedstudent.text is not None:
                staff_supervisedstudents.append(supervisedstudent.text)

    # Cleanup statement and biography
    staff_statement = cleanup_html(staff_statement)
    staff_biography = cleanup_html(staff_biography)

    # Split name into first name, last name and title
    title_prefixes = ("Professor", "Dr", "Sir")
    name_split = staff_name.split()
    if staff_name == "A L Rees":
        # A L Rees needs to be split up manually
        staff_titleprefix = ""
        staff_firstname = "A L"
        staff_lastname = "Rees"
    elif name_split and name_split[0] in title_prefixes:
        staff_titleprefix = name_split[0]
        # Slicing keeps a name consisting of just a title from failing
        staff_firstname = " ".join(name_split[1:2])
        staff_lastname = " ".join(name_split[2:])
    else:
        staff_titleprefix = ""
        staff_firstname = " ".join(name_split[:1])
        staff_lastname = " ".join(name_split[1:])

    # Remove "Programme" from staff_programme if it is there.  Done for all
    # staff members; an empty programme is left alone.
    staff_programme_split = staff_programme.split()
    if (staff_programme_split
            and staff_programme_split[-1] in ("Programme", "Programmes")):
        staff_programme = " ".join(staff_programme_split[:-1])

    # Slugs (the school slug is keyed by the school, not the programme)
    staff_programme_slug = constants.PROGRAMMES.get(staff_programme, "")
    staff_school_slug = constants.SCHOOLS.get(staff_school, "")

    # Create page for staff member
    try:
        staffpage = StaffPage.objects.get(rca_content_id=staff_contentid)
    except StaffPage.DoesNotExist:
        staffpage = StaffPage()
        staffpage.rca_content_id = staff_contentid
    staffpage.title = staff_name
    staffpage.school = staff_school_slug
    staffpage.staff_type = "academic"
    staffpage.intro = staff_statement
    staffpage.biography = staff_biography
    staffpage.show_on_homepage = False
    staffpage.show_on_programme_page = False
    staffpage.title_prefix = staff_titleprefix
    staffpage.first_name = staff_firstname
    staffpage.last_name = staff_lastname
    if staff_supervisedstudents:
        staffpage.supervised_student_other = ", ".join(
            staff_supervisedstudents)
    staffpage.slug = make_slug(staffpage)
    if staffpage.id:
        staffpage.save()
    else:
        STAFF_INDEX_PAGE.add_child(staffpage)

    # Create role
    try:
        staffpagerole = StaffPageRole.objects.get(page=staffpage)
    except StaffPageRole.DoesNotExist:
        staffpagerole = StaffPageRole()
        staffpagerole.page = staffpage

    staffpagerole.title = staff_title
    staffpagerole.school = staff_school_slug
    staffpagerole.programme = staff_programme_slug
    if staff_emails:
        staffpagerole.email = staff_emails[0]
    staffpagerole.save()

    # Images
    images_element = element.find('images')
    if images_element is not None:
        for image in images_element.findall('image'):
            # Import the image
            theimage, error = import_image(image)

            # Add to carousel
            if theimage is not None:
                StaffPageCarouselItem.objects.get_or_create(page=staffpage,
                                                            image=theimage)

    # Research pages
    researchpages_element = element.find('researchpages')
    if researchpages_element is not None:
        for researchpage in researchpages_element.findall('page'):
            import_staff_researchpage(staffpage, researchpage)

        # Research child pages
        research_childpages_element = researchpages_element.find('childpages')
        if research_childpages_element is not None:
            for childpage in research_childpages_element.findall('page'):
                import_staff_researchpage(staffpage, childpage)

    # Append URLs to bottom of practice block.  An empty list is not added
    # when there are no URLs.
    # NOTE(review): the URLs go into the HTML unescaped.
    if staff_urls:
        url_items = []
        for url in staff_urls:
            url_valid = url
            if "://" not in url_valid:
                url_valid = "http://" + url_valid
            url_items.append(
                "<li><a href=\"" + url_valid + "\">" + url + "</a></li>")
        urls_html = "<ul>" + "".join(url_items) + "</ul>"
        staffpage.practice = (staffpage.practice or "") + urls_html

    # Resave page
    staffpage.save()

    return errors
def import_staff_researchpage(staffpage, element):
    """Import one research page element belonging to a staff member.

    When the page title is a key of ``interesting_pages`` the page text is
    stored on the matching ``staffpage`` attribute and the page's images are
    attached to the staff page carousel.  Otherwise a ``ResearchItem`` is
    created or updated (looked up by ``rca_content_id``), linked to
    ``staffpage`` as its creator, and the images go to the research item
    carousel instead.

    Returns a dict with the ``'title'`` and ``'texts'`` error values reported
    by ``text_from_elem`` and ``import_texts``.
    """
    content_id = element.attrib['contentid']
    page_title, title_error = text_from_elem(element,
                                             'title',
                                             length=255,
                                             textify=True)
    page_texts, texts_error = import_texts(element.find('texts'))
    errors = {'title': title_error, 'texts': texts_error}

    if page_title in interesting_pages:
        # The text belongs on the staff page itself; the caller is
        # responsible for saving staffpage afterwards.
        setattr(staffpage, interesting_pages[page_title], page_texts)
        carousel_model = StaffPageCarouselItem
        carousel_page = staffpage
    else:
        # School and programme are inherited from the staff page role,
        # falling back to empty strings when no role exists.
        try:
            role = StaffPageRole.objects.get(page=staffpage)
            school = role.school
            programme = role.programme
        except StaffPageRole.DoesNotExist:
            school = ""
            programme = ""

        # Reuse the research item for this content id if there is one
        try:
            item = ResearchItem.objects.get(rca_content_id=content_id)
        except ResearchItem.DoesNotExist:
            item = ResearchItem()
            item.rca_content_id = content_id
        item.title = page_title
        item.research_type = "staff"
        item.description = page_texts
        item.school = school
        item.programme = programme
        item.slug = make_slug(item)

        # New items are inserted under the research index page
        if not item.id:
            RESEARCH_INDEX_PAGE.add_child(item)
        else:
            item.save()

        ResearchItemCreator.objects.get_or_create(page=item,
                                                  person=staffpage)
        carousel_model = ResearchItemCarouselItem
        carousel_page = item

    # Carousel images: same procedure for both kinds of page, only the
    # carousel model and the owning page differ.
    images_element = element.find('images')
    if images_element is None:
        return errors

    for image_element in images_element.findall('image'):
        imported_image, _image_error = import_image(image_element)
        if imported_image is None:
            continue
        carousel_model.objects.get_or_create(page=carousel_page,
                                             image=imported_image)

    return errors
Esempio n. 10
0
def doimport(**kwargs):
    """Import news items from an XML export as ``NewsItem`` pages.

    Each ``news_item`` element under the root of the parsed file is matched
    to an existing ``NewsItem`` by ``rca_content_id`` (a new one is built when
    none exists).  Its title, date, intro, slug and body are refreshed and its
    ``image`` elements are imported as ``RcaImage`` objects.  Images for which
    ``is_landscape()`` is true become carousel items; other saved images are
    embedded at the top of the body.

    Keyword arguments:
        path: XML file to parse (default: module-level ``PATH``).
        save: when true, write pages, images and carousel items to the
            database (default: ``False``, i.e. a dry run).
        image_path: prefix prepended to every image filename
            (default: module-level ``IMAGE_PATH``).
        ruthless: when true *and* ``save`` is true, delete the images embedded
            in the existing body and the images of existing carousel items
            before re-importing (default: ``False``).

    Returns:
        ``(errors, images_errors)``: two lists of single-entry dicts mapping
        the XML element to a dict of its non-empty field errors.
    """
    path = kwargs.get('path', PATH)
    save = kwargs.get('save', False)
    image_path = kwargs.get('image_path', IMAGE_PATH)
    ruthless = kwargs.get('ruthless', False)
    newsindex = NEWS_INDEX
    tree = ET.parse(path)
    root = tree.getroot()
    errors = []
    images_errors = []
    for item in root.findall('news_item'):
        itemerrors = {}

        # sort out what instance this is
        news_contentid = item.attrib['contentid']
        title, itemerrors['title'] = text_from_elem(item, 'title', length=255)
        # Dates come dot-separated; a missing or unparsable date falls back
        # to today instead of crashing on a missing element.
        raw_date = (item.findtext('goinglivedate') or '').strip().replace(
            '.', '-')
        date = (parse_date(raw_date) if raw_date else None) \
            or datetime.date.today()
        try:
            newsitem = NewsItem.objects.get(rca_content_id=news_contentid)
        except NewsItem.DoesNotExist:
            newsitem = NewsItem(rca_content_id=news_contentid)
        newsitem.title = title
        newsitem.date = date
        newsitem.intro = richtext_from_elem(item.find('intro'))
        newsitem.slug = make_slug(newsitem)

        # Possibly delete any images that are embedded in the existing body.
        # Gated on `save` (like the image replacement further down) so that a
        # dry run never removes anything from the database.
        if ruthless and save:
            soup = BeautifulSoup(newsitem.body or '', 'html.parser')
            to_delete_ids = []
            for embed in soup.find_all('embed'):
                try:
                    to_delete_ids.append(int(embed.attrs['id']))
                except (KeyError, ValueError):
                    # embed without an id, or with a non-numeric one
                    pass
            if to_delete_ids:
                RcaImage.objects.filter(id__in=to_delete_ids).delete()

        # build the body
        strings = []
        texts_element = item.find('texts')
        # `is not None`: an Element's truth value reflects its child count,
        # not whether find() matched.
        if texts_element is not None:
            strings = [
                richtext_from_elem(elem.find('content'))
                for elem in texts_element.findall('text')
            ]
        newsitem.body = '\n'.join(strings)

        # save newsitem
        if save:
            if newsitem.id:
                newsitem.save()
            else:
                newsindex.add_child(newsitem)

        tobesaved = False
        images_element = item.find('images')
        if images_element is not None:
            # first delete the images of every existing carousel item of
            # this page; they are re-imported below
            if ruthless and save:
                for c in NewsItemCarouselItem.objects.filter(page=newsitem):
                    c.image.delete()
                    c.delete()

            for image in images_element.findall('image'):
                imageerrors = {}
                metadata = image.find('imagemetadata')
                im_contentid = image.attrib['contentid']
                filename = urllib2.unquote(image.find('filename').text.strip())
                try:
                    theimage = RcaImage.objects.get(
                        rca_content_id=im_contentid)
                except RcaImage.DoesNotExist:
                    theimage = RcaImage(rca_content_id=im_contentid)

                theimage.title, imageerrors['title'] = text_from_elem(
                    metadata, 'title', length=255, textify=True)
                theimage.creator, imageerrors['creator'] = text_from_elem(
                    metadata, 'creator', length=255, textify=True)
                theimage.medium, imageerrors['medium'] = text_from_elem(
                    metadata, 'media', length=255, textify=True)
                theimage.photographer, imageerrors['photog'] = text_from_elem(
                    metadata, 'photographer', length=255, textify=True)
                theimage.permission, imageerrors['perms'] = text_from_elem(
                    metadata, 'rights', length=255, textify=True)

                caption, imageerrors['caption'] = text_from_elem(metadata,
                                                                 'caption',
                                                                 length=255,
                                                                 textify=True)
                theimage.alt = caption

                try:
                    # Image data is binary: open with 'rb' so the bytes are
                    # never newline-translated or decoded.
                    with File(open(image_path + filename.encode('utf-8'),
                                   'rb')) as f:
                        # An existing image is replaced: drop the old row,
                        # then save it again with the new file.
                        if theimage.id:
                            if save:
                                theimage.delete()
                        theimage.file = f
                        if save:
                            theimage.save()
                except IOError as e:
                    print("I/O error({0}): {1}".format(e.errno, e.strerror))
                    print(repr(filename))
                except ValueError:
                    print("Could not convert data to an integer.")
                except:
                    # Report anything unexpected, then let it propagate.
                    import sys
                    print("Unexpected error: %s" % (sys.exc_info()[0],))
                    raise

                if save and theimage.is_landscape():
                    NewsItemCarouselItem.objects.get_or_create(
                        page=newsitem,
                        image=theimage,
                    )
                elif save and theimage.id:
                    # Other saved images are embedded at the top of the body
                    imagestring = '<embed alt="%(alt)s" embedtype="image" format="right" id="%(id)s"/>' % {
                        'alt': theimage.alt,
                        'id': theimage.id,
                    }
                    newsitem.body = imagestring + newsitem.body
                    tobesaved = True

                # keep only the fields that actually reported an error
                imageerrordict = dict(
                    (k, v) for k, v in imageerrors.items() if v)
                if imageerrordict:
                    images_errors.append({image: imageerrordict})
        if tobesaved and save:
            newsitem.save()

        errordict = dict((k, v) for k, v in itemerrors.items() if v)
        if errordict:
            errors.append({item: errordict})
    return errors, images_errors
Esempio n. 11
0
def doimport(**kwargs):
    """Import show student pages, department by department, from XML.

    For every ``department`` element the programme and school are looked up
    from the department page title (after an optional remap through
    ``PROGRAMME_SPECIALISMS``).  Each ``student`` in the department is then
    created or updated as a ``StudentPage`` keyed on ``rca_content_id``,
    together with its profile image, sponsors, collaborators, CV entries,
    contact details and carousel images.  Progress and a summary of missing
    image files are printed.

    Keyword arguments:
        save: when true, write to the database (default: ``False``).
        path: XML file to parse (default: module-level ``PATH``).
        image_path: directory prefix under which image files are looked up
            (default: module-level ``IMAGE_PATH``).

    Returns:
        ``(images_errors, errors)``: ``images_errors`` is a list of
        single-entry dicts mapping an image element to its non-empty field
        errors; ``errors`` maps each programme to a dict of department and
        per-student errors.
    """
    save = kwargs.get('save', False)
    path = kwargs.get('path', PATH)
    image_path = kwargs.get('image_path', IMAGE_PATH)
    show_index = SHOW_INDEX
    tree = ET.parse(path)
    root = tree.getroot()
    errors = {}
    images_errors = []
    dept_count = 0
    total_students = 0
    new_count = 0
    student_save_count = 0
    for d in root.findall('department'):
        dept_count += 1
        page = d.find('page')
        depterrors = {}
        dept_title, depterrors['title'] = text_from_elem(page, 'title')
        specialism = ''
        print('\nNow importing: ' + repr(dept_title))
        # Some departments are really a specialism of another programme
        if dept_title in PROGRAMME_SPECIALISMS:
            dept_title, specialism = PROGRAMME_SPECIALISMS[dept_title]
        print('dept: ' + repr(dept_title))
        theprogramme = PROGRAMMES[dept_title]
        print('prog: ' + repr(theprogramme))
        theschool = SCHOOLS[dept_title]
        print('scho: ' + repr(theschool))

        # The department blurb is only printed for information; prefer the
        # first text block and fall back to the synopsis.
        h = html2text.HTML2Text()
        h.body_width = 0
        try:
            blurb = page.find('texts').findall('text')[0].find('content')
        except (AttributeError, IndexError):
            blurb = page.find('synopsis')
        blurb_text = blurb.text if blurb is not None else None
        blurb = h.handle(blurb_text).strip() if blurb_text else ''
        print("Blurb: " + repr(blurb))
        print("******* note that the above text will not be imported *******")

        student_count = 0

        for s in d.findall('student'):
            student_count += 1
            s = s.find('studentpage')
            sp_contentid = s.attrib['contentid']
            try:
                sp = StudentPage.objects.get(rca_content_id=sp_contentid)
            except StudentPage.DoesNotExist:
                sp = StudentPage(rca_content_id=sp_contentid)
            sp_errs = {}

            sp.title, sp_errs['title'] = text_from_elem(s, 'title', length=255)
            # there is no intro text in any of the data at time of writing
            sp.slug = make_slug(sp)
            statement = richtext_from_elem(s.find('statement'))

            statement_text, sponsors, collaborators = statement_extract(statement)
            sp.statement = statement_text
            sp.work_description = statement_text

            # handle the metadata fields
            metadata = s.find('metadata')
            # format the current degree
            sp.degree_year, sp_errs['deg_year'] = text_from_elem(metadata, 'year', length=255)
            degree_subject, sp_errs['deg_subj'] = text_from_elem(metadata, 'degrees', length=255)
            # Drop a trailing '?'; endswith() is safe on an empty string
            if degree_subject.endswith('?'):
                degree_subject = degree_subject[:-1]
            sp.degree_subject = DEGREE_SUBJECTS[degree_subject]
            degree_qualification, sp_errs['deg_qual'] = text_from_elem(metadata, 'degree', length=255)
            sp.degree_qualification = degree_qualification.lower()
            # metadata contains first and last names in separate fields
            sp.first_name, sp_errs['first_name'] = text_from_elem(metadata, 'firstname', length=255)
            sp.last_name, sp_errs['last_name'] = text_from_elem(metadata, 'surname', length=255)
            # we worked out the programme and school earlier from the dept_page
            sp.programme = theprogramme
            sp.school = theschool
            if not specialism and metadata.find('specialism') is not None:
                sp.specialism, sp_errs['specialism'] = text_from_elem(metadata, 'specialism')
            else:
                sp.specialism = specialism

            # handle profile image: looked up by slugified student title,
            # first as .jpg and then as .png
            profile_contentid = sp_contentid + 'profile_image'
            try:
                profile_image = RcaImage.objects.get(rca_content_id=profile_contentid)
            except RcaImage.DoesNotExist:
                profile_image = RcaImage(rca_content_id=profile_contentid)
            profile_filename = slugify(unicode(sp.title)).replace('-', '_')
            profile_image_path = image_path + "show_2013_profiles/2400_" + sp.programme + "/"
            profile_image.title = sp.title + ' profile image'
            if not profile_image.id:
                try:
                    with File(open(normalize("NFKD", profile_image_path + profile_filename + '.jpg'), 'rb')) as f:
                        profile_image.file = f
                        if save:
                            profile_image.save()
                except IOError as e:
                    try:
                        with File(open(normalize("NFKD", profile_image_path + profile_filename + '.png'), 'rb')) as f:
                            profile_image.file = f
                            if save:
                                profile_image.save()
                    except IOError as e:
                        print("I/O error({0}): {1}".format(e.errno, e.strerror) + " " + profile_image_path + profile_filename)
                        sp_errs['image_not_found'] = profile_image_path + profile_filename
                except ValueError:
                    print("Could not convert data to an integer.")
                except:
                    # Report anything unexpected, then let it propagate.
                    import sys
                    print("Unexpected error: %s" % (sys.exc_info()[0],))
                    raise
            else:
                if save:
                    profile_image.save()
            sp.profile_image = profile_image

            # save the studentpage for foreignkey purposes
            if save:
                student_save_count += 1
                if sp.id:
                    sp.save()
                else:
                    new_count += 1
                    show_index.add_child(sp)
            elif not sp.id:
                new_count += 1

            # handle the sponsors and collaborators from earlier.
            # get_or_create keeps a re-import from duplicating rows, and an
            # error is only recorded when there is one, so a later clean
            # entry cannot erase an earlier error.
            for spon in sponsors:
                name, length_error = check_length(spon, 255)
                if length_error:
                    sp_errs['sponsors'] = length_error
                if save:
                    StudentPageWorkSponsor.objects.get_or_create(page=sp, name=name)
            for col in collaborators:
                name, length_error = check_length(col, 255)
                if length_error:
                    sp_errs['collaborators'] = length_error
                if save:
                    StudentPageWorkCollaborator.objects.get_or_create(page=sp, name=name)

            # handle the cv fields
            cv = s.find('cv')

            sp_errs['degree'] = cv_handle(
                    cv, 'degrees', StudentPageDegree, sp, length=255, fieldname='degree', save=save)
            sp_errs['exhibition'] = cv_handle(
                    cv, 'exhibition', StudentPageExhibition, sp, length=255, save=save)
            sp_errs['experience'] = cv_handle(
                    cv, 'experience', StudentPageExperience, sp, length=255, save=save)
            sp_errs['awards'] = cv_handle(
                    cv, 'awards', StudentPageAwards, sp, length=255, fieldname='award', save=save)
            if cv is not None and cv.find('sponsors') is not None:
                cv_sponsor_errors = cv_handle(
                        cv, 'sponsors', StudentPageWorkSponsor, sp, length=255, fieldname='name', save=save)
                if cv_sponsor_errors:
                    sp_errs['sponsors'] = cv_sponsor_errors
            # currently the model doesn't have publications or conferences

            # Contact details: children without text are skipped
            emails_element = s.find('emails')
            if emails_element is not None:
                for emailaddress in emails_element:
                    if emailaddress.text:
                        emailtext = emailaddress.text.strip()
                        if save:
                            StudentPageContactsEmail.objects.get_or_create(page=sp, email=emailtext)

            phonenumbers_element = s.find('phonenumbers')
            if phonenumbers_element is not None:
                for num in phonenumbers_element:
                    if num.text:
                        phonenumber = num.text.strip()
                        if save:
                            StudentPageContactsPhone.objects.get_or_create(page=sp, phone=phonenumber)

            urls_element = s.find('urls')
            if urls_element is not None:
                for url in urls_element:
                    if url.text:
                        urltext = url.text.strip()
                        if save:
                            StudentPageContactsWebsite.objects.get_or_create(page=sp, website=urltext)

            # handle images tag
            images = s.find('images')
            if images is not None:
                for forloop_counter, image in enumerate(images.findall('image'), 1):
                    imageerrors = {}
                    metadata = image.find('imagemetadata')
                    im_contentid = image.attrib['contentid']
                    # images without a content id get one derived from the
                    # student and the image position
                    if not im_contentid:
                        im_contentid = sp_contentid + '_image_' + str(forloop_counter)
                    try:
                        theimage = RcaImage.objects.get(rca_content_id=im_contentid)
                    except RcaImage.DoesNotExist:
                        theimage = RcaImage(rca_content_id=im_contentid)
                    theimage.title, imageerrors['title'] = text_from_elem(metadata, 'title', length=255, textify=True)
                    theimage.creator, imageerrors['creator'] = text_from_elem(metadata, 'creator', length=255, textify=True)
                    theimage.medium, imageerrors['medium'] = text_from_elem(metadata, 'media', length=255, textify=True)
                    photographer, imageerrors['photographer'] = text_from_elem(metadata, 'photographer', length=255)
                    if photographer.strip().startswith('&copy;'):
                        photographer = photographer.replace('&copy;', '').strip()
                    theimage.photographer = photographer
                    # NOTE(review): the news importer assigns `permission`
                    # (singular) -- confirm which attribute RcaImage defines.
                    theimage.permissions, imageerrors['permissions'] = text_from_elem(metadata, 'rights', length=255)

                    caption, imageerrors['caption'] = text_from_elem(metadata, 'caption', length=255, textify=True)
                    theimage.alt = caption

                    filename = unicode(urllib2.unquote(image.find('filename').text.strip()))
                    image_success = False
                    full_image_path = image_path + '2400_' + sp.programme + "/"
                    if not theimage.id:
                        try:
                            with File(open(normalize("NFKD", full_image_path + filename), 'rb')) as f:
                                theimage.file = f
                                if save:
                                    theimage.save()
                                    image_success = True
                        except IOError as e:
                            # Retry with a .png extension; rsplit swaps the
                            # extension whatever its length.
                            png_filename = filename.rsplit('.', 1)[0] + '.png'
                            try:
                                with File(open(normalize("NFKD", full_image_path + png_filename), 'rb')) as f:
                                    theimage.file = f
                                    if save:
                                        theimage.save()
                                        image_success = True
                            except IOError as e:
                                print("I/O error({0}): {1}".format(e.errno, e.strerror))
                                print(full_image_path + filename)
                                imageerrors['image_not_found'] = full_image_path + filename
                        except ValueError:
                            print("Could not convert data to an integer.")
                        except:
                            # Report anything unexpected, then let it propagate.
                            import sys
                            print("Unexpected error: %s" % (sys.exc_info()[0],))
                            raise
                    else:
                        if save:
                            theimage.save()
                            image_success = True
                    if save and image_success:
                        StudentPageCarouselItem.objects.get_or_create(page=sp, image=theimage)

                    # keep only the fields that actually reported an error
                    newimageerrordict = dict((k, v) for k, v in imageerrors.items() if v)
                    if newimageerrordict:
                        images_errors.append({image: newimageerrordict})
            errordict = dict((k, v) for k, v in sp_errs.items() if v)
            if errordict:
                depterrors[sp.title] = errordict
        errordict = dict((k, v) for k, v in depterrors.items() if v)
        if errordict:
            # Several departments can map to the same programme (see
            # PROGRAMME_SPECIALISMS), so merge instead of overwriting.
            errors.setdefault(theprogramme, {}).update(errordict)
        print("%(student_count)s students" % {'student_count': student_count})
        total_students += student_count
    print("%(d)s departments imported, total %(s)s students, %(sv)s saved (%(n)s new)" % {
            'd': dept_count,
            's': total_students,
            'sv': student_save_count,
            'n': new_count,
            })

    # Summary of missing image files.  Entries may hold other kinds of
    # errors only, so 'image_not_found' is checked before it is read.
    profile_not_found_count = 0
    image_not_found_count = 0
    for dept, depterrors in errors.items():
        print('\n' + dept + '\n' + '=' * len(dept))
        for name, sp_errs in depterrors.items():
            if isinstance(sp_errs, dict) and 'image_not_found' in sp_errs:
                print(name)
                print(sp_errs['image_not_found'])
                profile_not_found_count += 1
    print('\nImage errors\n============')
    for image_dict in images_errors:
        for image, error_dict in image_dict.items():
            if isinstance(error_dict, dict) and 'image_not_found' in error_dict:
                print(error_dict['image_not_found'])
                image_not_found_count += 1

    print(str(profile_not_found_count) + " profile images not found")
    print(str(image_not_found_count) + " artwork images not found")
    print('\n\n')
    return images_errors, errors
Esempio n. 12
0
    def import_student(self, element):
        """Import one research student from its XML element.

        Creates or updates the ``StudentPage`` whose ``rca_content_id``
        matches the element's ``contentid`` attribute, then imports its
        e-mail addresses, websites, carousel images and research pages.
        Nothing is written to the database unless ``self.save`` is true.

        Returns the dict of field errors collected while reading the element,
        or ``None`` when the student's name is in ``IGNORED_NAMES`` and the
        element is skipped.
        """
        errors = {}

        # Basic info
        student_contentid = element.attrib["contentid"]
        student_title, errors["title"] = text_from_elem(element,
                                                        "title",
                                                        length=255,
                                                        textify=True)
        student_name, errors["name"] = text_from_elem(element,
                                                      "staffname",
                                                      length=255,
                                                      textify=True)
        student_programme, errors["programme"] = text_from_elem(element,
                                                                "programme",
                                                                length=255,
                                                                textify=True)
        student_biography, errors["biography"] = text_from_elem(
            element, "biography")
        student_school, errors["school"] = text_from_elem(element,
                                                          "school",
                                                          length=255,
                                                          textify=True)
        # Only the error is kept; the value itself is not used below.
        errors["editorialreference"] = text_from_elem(element,
                                                      "editorialreference",
                                                      length=255,
                                                      textify=True)[1]

        # If name is in ignore list, skip it
        if student_name in IGNORED_NAMES:
            return None

        # Emails (entries without text are dropped)
        student_emails = []
        emails_element = element.find("emails")
        if emails_element is not None:
            student_emails = [
                email.text for email in emails_element.findall("email")
                if email.text
            ]

        # URLs (entries without text are dropped)
        student_urls = []
        urls_element = element.find("urls")
        if urls_element is not None:
            student_urls = [
                url.text for url in urls_element.findall("url") if url.text
            ]

        # Supervisor: only looked up when a name is actually present
        student_supervisor = None
        supervisedstudents_element = element.find("supervisedstudents")
        if supervisedstudents_element is not None:
            student_supervisor_name = supervisedstudents_element.findtext(
                "supervisedstudent")
            if student_supervisor_name:
                # Get page for supervisor
                student_supervisor = self.find_staff_page(
                    student_supervisor_name)

        # Cleanup biography
        student_biography = cleanup_html(student_biography)

        # Split name: first word is the first name, the rest the last name
        name_split = student_name.split()
        student_firstname = " ".join(name_split[:1])
        student_lastname = " ".join(name_split[1:])

        # Remove a trailing "Programme"/"Programmes" from student_programme
        student_programme_split = student_programme.split()
        if (student_programme_split
                and student_programme_split[-1] in ("Programme",
                                                    "Programmes")):
            student_programme = " ".join(student_programme_split[:-1])

        # Remove a leading literal backslash + "r" (two characters, not a
        # carriage return) from student_programme
        if student_programme and student_programme.startswith("\\r"):
            student_programme = student_programme[2:]

        # Remove a literal backslash + "n" from both ends of student_school
        if student_school and student_school.startswith("\\n"):
            student_school = student_school[2:]
        if student_school and student_school.endswith("\\n"):
            student_school = student_school[:-2]

        # If student is in STUDENT_PROGRAMMES, use the programme set there
        if student_name in STUDENT_PROGRAMMES:
            student_programme = STUDENT_PROGRAMMES[student_name]

        # Slugs (empty string when the name is not known)
        student_programme_slug = constants.PROGRAMMES.get(
            student_programme, "")
        student_school_slug = constants.SCHOOLS.get(student_school, "")
        degree_subject_slug = constants.DEGREE_SUBJECTS.get(
            student_programme, "")

        # Create page for student
        try:
            studentpage = StudentPage.objects.get(
                rca_content_id=student_contentid)
        except StudentPage.DoesNotExist:
            studentpage = StudentPage()
            studentpage.rca_content_id = student_contentid
        studentpage.title = student_name
        studentpage.school = student_school_slug
        studentpage.programme = student_programme_slug
        studentpage.degree_qualification = "researchstudent"
        studentpage.degree_subject = degree_subject_slug
        studentpage.degree_year = ""
        studentpage.statement = student_biography
        studentpage.funding = student_title
        studentpage.show_on_homepage = False
        studentpage.show_on_programme_page = False
        studentpage.first_name = student_firstname
        studentpage.last_name = student_lastname
        studentpage.supervisor = student_supervisor
        studentpage.slug = make_slug(studentpage)
        if self.save:
            if studentpage.id:
                studentpage.save()
            else:
                self.student_index_page.add_child(studentpage)

        # Emails and URLs: like the carousel items below, these are only
        # written when saving, so a dry run never touches the database or
        # references an unsaved page.
        if self.save:
            for email in student_emails:
                StudentPageContactsEmail.objects.get_or_create(
                    page=studentpage, email=email)

            for url in student_urls:
                StudentPageContactsWebsite.objects.get_or_create(
                    page=studentpage, website=url)

        # Images
        images_element = element.find("images")
        if images_element is not None:
            for image in images_element.findall("image"):
                # Import the image
                theimage, error = self.import_image(image)

                if theimage is not None and self.save:
                    # Add to carousel
                    StudentPageCarouselItem.objects.get_or_create(
                        page=studentpage, image=theimage)

        # Research pages
        researchpages_element = element.find("researchpages")
        if researchpages_element is not None:
            for researchpage in researchpages_element.findall("page"):
                self.import_student_researchpage(studentpage, researchpage)

            # Research child pages
            research_childpages_element = researchpages_element.find(
                "childpages")
            if research_childpages_element is not None:
                for childpage in research_childpages_element.findall("page"):
                    self.import_student_researchpage(studentpage, childpage)

        # Resave page
        if self.save:
            studentpage.save()

        return errors
Esempio n. 13
0
def doimport(**kwargs):
    save = kwargs.get('save', False)
    path = kwargs.get('path', PATH)
    image_path = kwargs.get('image_path', IMAGE_PATH)
    show_index = SHOW_INDEX
    tree = ET.parse(path)
    root = tree.getroot()
    errors = {}
    images_errors = []
    dept_count = 0
    total_students = 0
    new_count = 0
    student_save_count = 0
    for d in root.findall('department'):
        dept_count += 1
        page = d.find('page')
        depterrors = {}
        dept_title, depterrors['title'] = text_from_elem(page, 'title')
        specialism = ''
        print '\nNow importing: ' + repr(dept_title)
        if dept_title in PROGRAMME_SPECIALISMS.keys():
            dept_title, specialism = PROGRAMME_SPECIALISMS[dept_title]
        print 'dept: ' + repr(dept_title)
        theprogramme = PROGRAMMES[dept_title]
        print 'prog: ' + repr(theprogramme)
        theschool = SCHOOLS[dept_title]
        print 'scho: ' + repr(theschool)

        h = html2text.HTML2Text()
        h.body_width = 0
        try:
            blurb = page.find('texts').findall('text')[0].find('content')
        except AttributeError:
            blurb = page.find('synopsis')
        blurb = h.handle(blurb.text).strip()
        print "Blurb: " + repr(blurb)
        print "******* note that the above text will not be imported *******"

        student_count = 0

        for s in d.findall('student'):
            student_count += 1
            s = s.find('studentpage')
            sp_contentid = s.attrib['contentid']
            try:
                sp = StudentPage.objects.get(rca_content_id=sp_contentid)
            except StudentPage.DoesNotExist:
                sp = StudentPage(rca_content_id=sp_contentid)
            sp_errs = {}

            sp.title, sp_errs['title'] = text_from_elem(s, 'title', length=255)
            # there is no intro text in any of the data at time of writing
            # intro, sp_errs['intro'] = text_from_elem(s, 'intro')
            sp.slug = make_slug(sp)
            statement = richtext_from_elem(s.find('statement'))

            statement_text, sponsors, collaborators = statement_extract(
                statement)
            sp.statement = statement_text
            sp.work_description = statement_text

            # handle the metadata fields
            metadata = s.find('metadata')
            # format the current degree
            sp.degree_year, sp_errs['deg_year'] = text_from_elem(metadata,
                                                                 'year',
                                                                 length=255)
            degree_subject, sp_errs['deg_subj'] = text_from_elem(metadata,
                                                                 'degrees',
                                                                 length=255)
            if degree_subject[-1] == '?':
                degree_subject = degree_subject[:-1]
            sp.degree_subject = DEGREE_SUBJECTS[degree_subject]
            degree_qualification, sp_errs['deg_qual'] = text_from_elem(
                metadata, 'degree', length=255)
            sp.degree_qualification = degree_qualification.lower()
            # metadata contains first and last names in separate fields
            sp.first_name, sp_errs['first_name'] = text_from_elem(metadata,
                                                                  'firstname',
                                                                  length=255)
            sp.last_name, sp_errs['last_name'] = text_from_elem(metadata,
                                                                'surname',
                                                                length=255)
            # we worked out the programme and school earlier from the dept_page
            sp.programme = theprogramme
            sp.school = theschool
            if not specialism and metadata.find('specialism') is not None:
                sp.specialism, sp_errs['specialism'] = text_from_elem(
                    metadata, 'specialism')
            else:
                sp.specialism = specialism
            # handle profile image
            try:
                profile_image = RcaImage.objects.get(
                    rca_content_id=sp_contentid + 'profile_image')
            except RcaImage.DoesNotExist:
                profile_image = RcaImage(rca_content_id=sp_contentid +
                                         'profile_image')
            profile_filename = slugify(unicode(sp.title)).replace('-', '_')
            profile_image_path = image_path + "show_2013_profiles/2400_" + sp.programme + "/"
            profile_image.title = sp.title + ' profile image'
            if not profile_image.id:
                try:
                    with File(
                            open(
                                normalize(
                                    "NFKD", profile_image_path +
                                    profile_filename + '.jpg'), 'rb')) as f:
                        profile_image.file = f
                        if save:
                            profile_image.save()
                except IOError as e:
                    try:
                        with File(
                                open(
                                    normalize(
                                        "NFKD", profile_image_path +
                                        profile_filename + '.png'),
                                    'rb')) as f:
                            profile_image.file = f
                            if save:
                                profile_image.save()
                    except IOError as e:
                        print "I/O error({0}): {1}".format(
                            e.errno, e.strerror
                        ) + " " + profile_image_path + profile_filename
                        sp_errs[
                            'image_not_found'] = profile_image_path + profile_filename
                except ValueError:
                    print "Could not convert data to an integer."
                except:
                    import sys
                    print "Unexpected error:", sys.exc_info()[0]
                    raise
            else:
                if save:
                    profile_image.save()
            sp.profile_image = profile_image

            # save the studentpage for foreignkey purposes
            if save:
                student_save_count += 1
                if sp.id:
                    sp.save()
                else:
                    new_count += 1
                    show_index.add_child(sp)
            elif not sp.id:
                new_count += 1

            # handle the sponsors and collaborators from earlier
            for spon in sponsors:
                name, sp_errs['sponsors'] = check_length(spon, 255)
                if save:
                    sponpage = StudentPageWorkSponsor(page=sp, name=name)
                    sponpage.save()
            for col in collaborators:
                name, sp_errs['collaborators'] = check_length(col, 255)
                if save:
                    colpage = StudentPageWorkCollaborator(page=sp, name=name)
                    colpage.save()

            # handle the cv fields
            cv = s.find('cv')

            sp_errs['degree'] = cv_handle(cv,
                                          'degrees',
                                          StudentPageDegree,
                                          sp,
                                          length=255,
                                          fieldname='degree',
                                          save=save)
            sp_errs['exhibition'] = cv_handle(cv,
                                              'exhibition',
                                              StudentPageExhibition,
                                              sp,
                                              length=255,
                                              save=save)
            sp_errs['experience'] = cv_handle(cv,
                                              'experience',
                                              StudentPageExperience,
                                              sp,
                                              length=255,
                                              save=save)
            sp_errs['awards'] = cv_handle(cv,
                                          'awards',
                                          StudentPageAwards,
                                          sp,
                                          length=255,
                                          fieldname='award',
                                          save=save)
            if cv.find('sponsors') is not None:
                sp_errs['sponsors'] = cv_handle(cv,
                                                'sponsors',
                                                StudentPageWorkSponsor,
                                                sp,
                                                length=255,
                                                fieldname='name',
                                                save=save)
            # currently the model doesn't have publications or conferences
            #sp_errs['publications'] = cv_handle(
            #        cv, 'publications', StudentPagePublications, sp, length=255)
            #sp_errs['conferences'] = cv_handle(
            #        cv, 'conferences', StudentPageConferences, sp, length=255)

            if s.find('emails') is not None:
                for emailaddress in s.find('emails').getchildren():
                    emailtext = emailaddress.text.strip()
                    if save:
                        StudentPageContactsEmail.objects.get_or_create(
                            page=sp, email=emailtext)

            if s.find('phonenumbers') is not None:
                for num in s.find('phonenumbers').getchildren():
                    if num.text:
                        phonenumber = num.text.strip()
                        if save:
                            StudentPageContactsPhone.objects.get_or_create(
                                page=sp, phone=phonenumber)

            if s.find('urls') is not None:
                for url in s.find('urls').getchildren():
                    if url.text:
                        urltext = url.text.strip()
                        if save:
                            StudentPageContactsWebsite.objects.get_or_create(
                                page=sp, website=urltext)

            # handle images tag
            images = s.find('images')
            forloop_counter = 0
            if images is not None:
                for image in images.findall('image'):
                    forloop_counter += 1
                    imageerrors = {}
                    metadata = image.find('imagemetadata')
                    im_contentid = image.attrib['contentid']
                    if not im_contentid:
                        im_contentid = sp_contentid + '_image_' + str(
                            forloop_counter)
                    try:
                        theimage = RcaImage.objects.get(
                            rca_content_id=im_contentid)
                    except RcaImage.DoesNotExist:
                        theimage = RcaImage(rca_content_id=im_contentid)
                    theimage.title, imageerrors['title'] = text_from_elem(
                        metadata, 'title', length=255, textify=True)
                    theimage.creator, imageerrors['creator'] = text_from_elem(
                        metadata, 'creator', length=255, textify=True)
                    theimage.medium, imageerrors['medium'] = text_from_elem(
                        metadata, 'media', length=255, textify=True)
                    photographer, imageerrors['photographer'] = text_from_elem(
                        metadata, 'photographer', length=255)
                    if photographer.strip().startswith('&copy;'):
                        photographer = photographer.replace('&copy;',
                                                            '').strip()
                    theimage.photographer = photographer
                    theimage.permissions, imageerrors[
                        'permissions'] = text_from_elem(metadata,
                                                        'rights',
                                                        length=255)

                    caption, imageerrors['caption'] = text_from_elem(
                        metadata, 'caption', length=255, textify=True)
                    theimage.alt = caption

                    #theimage.width, imageerrors['width'] = text_from_elem(metadata, 'width', length=255)
                    #theimage.height, imageerrors['height'] = text_from_elem(metadata, 'height', length=255)

                    filename = unicode(
                        urllib2.unquote(image.find('filename').text.strip()))
                    image_success = False
                    full_image_path = image_path + '2400_' + sp.programme + "/"
                    if not theimage.id:
                        try:
                            with File(
                                    open(
                                        normalize("NFKD",
                                                  full_image_path + filename),
                                        'rb')) as f:
                                theimage.file = f
                                if save:
                                    theimage.save()
                                    image_success = True
                        except IOError as e:
                            try:
                                with File(
                                        open(
                                            normalize(
                                                "NFKD", full_image_path +
                                                filename[:-4] + '.png'),
                                            'rb')) as f:
                                    theimage.file = f
                                    if save:
                                        theimage.save()
                                        image_success = True
                            except IOError as e:
                                print "I/O error({0}): {1}".format(
                                    e.errno, e.strerror)
                                print full_image_path + filename
                                imageerrors[
                                    'image_not_found'] = full_image_path + filename
                        except ValueError:
                            print "Could not convert data to an integer."
                        except:
                            import sys
                            print "Unexpected error:", sys.exc_info()[0]
                            raise
                    else:
                        if save:
                            theimage.save()
                            image_success = True
                    if save and image_success:
                        StudentPageCarouselItem.objects.get_or_create(
                            page=sp, image=theimage)

                    newimageerrordict = dict(
                        (k, v) for k, v in imageerrors.iteritems() if v)
                    if newimageerrordict:
                        images_errors.append({image: newimageerrordict})
            errordict = dict((k, v) for k, v in sp_errs.iteritems() if v)
            if errordict:
                depterrors[sp.title] = errordict
        errordict = dict((k, v) for k, v in depterrors.iteritems() if v)
        if errordict:
            errors[theprogramme] = errordict
        print "%(student_count)s students" % {'student_count': student_count}
        total_students += student_count
    print "%(d)s departments imported, total %(s)s students, %(sv)s saved (%(n)s new)" % {
        'd': dept_count,
        's': total_students,
        'sv': student_save_count,
        'n': new_count,
    }
    profile_not_found_count = 0
    image_not_found_count = 0
    for dept, depterrors in errors.iteritems():
        print '\n' + dept + '\n' + '=' * len(dept)
        for name, sp_errs in depterrors.iteritems():
            if isinstance(sp_errs, dict):
                print name
                print sp_errs['image_not_found']
                profile_not_found_count += 1
    print '\nImage errors\n============'
    for image_dict in images_errors:
        for image, error_dict in image_dict.iteritems():
            if isinstance(error_dict, dict):
                print error_dict['image_not_found']
                image_not_found_count += 1

    print str(profile_not_found_count) + " profile images not found"
    print str(image_not_found_count) + " artwork images not found"
    print '\n\n'
    return images_errors, errors