def import_texts(element):
    """Render the ``<text>`` children of ``element`` as one HTML fragment.

    Every ``<text>`` child contributes its title wrapped in ``<b>`` followed
    by its content passed through ``cleanup_html``; consecutive entries are
    separated by ``<hr>``.

    Args:
        element: The XML element holding the ``<text>`` children, or ``None``
            when the parent document has no such section.

    Returns:
        A ``(html, errors)`` tuple. ``errors`` carries the ``"title"`` and
        ``"content"`` values reported by ``text_from_elem`` for the last
        ``<text>`` processed; it is empty when nothing was processed.
    """
    errors = {}

    # Callers hand in ``parent.find('texts')`` directly, which is None when
    # the section is missing; treat that like an empty section instead of
    # failing on ``None.findall``.
    if element is None:
        return "", errors

    sections = []
    for text in element.findall('text'):
        text_title, errors["title"] = text_from_elem(text, 'title', length=255)
        text_content, errors["content"] = text_from_elem(text, 'content')
        sections.append("".join(["<b>", text_title, "</b>", cleanup_html(text_content)]))

    # A single join replaces re-joining the whole accumulated string per entry.
    return "<hr>".join(sections), errors
def import_texts(element):
    """Render the ``<text>`` children of ``element`` as one HTML fragment.

    Every ``<text>`` child contributes its title wrapped in ``<b>`` followed
    by its content passed through ``cleanup_html``; consecutive entries are
    separated by ``<hr>``.

    Args:
        element: The XML element holding the ``<text>`` children, or ``None``
            when the parent document has no such section.

    Returns:
        A ``(html, errors)`` tuple. ``errors`` carries the ``"title"`` and
        ``"content"`` values reported by ``text_from_elem`` for the last
        ``<text>`` processed; it is empty when nothing was processed.
    """
    errors = {}

    # ``parent.find('texts')`` yields None for a missing section; return an
    # empty result rather than raising AttributeError on ``None.findall``.
    if element is None:
        return "", errors

    sections = []
    for text in element.findall('text'):
        text_title, errors["title"] = text_from_elem(text, 'title', length=255)
        text_content, errors["content"] = text_from_elem(text, 'content')
        sections.append("".join(["<b>", text_title, "</b>", cleanup_html(text_content)]))

    # Joining once avoids rebuilding the accumulated string for every entry.
    return "<hr>".join(sections), errors
def import_department(self, element):
    """Import every student record found under one department element.

    The department's content id and name are read from ``<departmentname>``
    but are not used further in this method. Each ``<staff>`` child is
    passed to ``self.import_student``. Nothing is returned.
    """
    problems = {}

    # Department header; raises if <departmentname> or its contentid is absent.
    header = element.find("departmentname")
    content_id = header.attrib["contentid"]
    name, problems["name"] = text_from_elem(element, "departmentname", length=255)

    # The student records are carried in <staff> elements.
    for record in element.findall("staff"):
        self.import_student(record)
def import_department(element):
    """Import all staff members listed under one department element.

    Reads the department's content id and name from ``<departmentname>``
    (neither value is used further here) and passes every ``<staff>`` child
    to ``import_staff``.

    Returns:
        Dict whose ``'name'`` entry is the error value reported by
        ``text_from_elem`` for the department name.
    """
    problems = {}

    # Department header; raises if <departmentname> or its contentid is absent.
    header = element.find('departmentname')
    content_id = header.attrib['contentid']
    name, problems['name'] = text_from_elem(element, 'departmentname', length=255)

    # One import per staff record.
    for member in element.findall('staff'):
        import_staff(member)

    return problems
def import_department(self, element):
    """Import every student record found under one department element.

    The department's content id and name are read from ``<departmentname>``
    but are not used further in this method. Each ``<staff>`` child is
    passed to ``self.import_student``. Nothing is returned.
    """
    problems = {}

    # Department header; raises if <departmentname> or its contentid is absent.
    header = element.find("departmentname")
    content_id = header.attrib["contentid"]
    name, problems["name"] = text_from_elem(element, "departmentname", length=255)

    # The student records are carried in <staff> elements.
    for record in element.findall("staff"):
        self.import_student(record)
def import_image(self, element):
    """Create or update the ``RcaImage`` described by an ``<image>`` element.

    The image is looked up by its ``contentid`` attribute; a new ``RcaImage``
    is built when none exists. Metadata is copied from the element and its
    ``<imagemetadata>`` child. Nothing is written unless ``self.save`` is
    true. For a new image the file is read from
    ``self.image_path + filename``.

    Args:
        element: The ``<image>`` XML element.

    Returns:
        ``(image, errors)`` where ``errors`` maps field names to the error
        values reported by ``text_from_elem``; ``(None, None)`` when the
        image file could not be loaded (IOError or ValueError).
    """
    errors = {}

    # Get image info
    image_contentid = element.attrib['contentid']
    image_filename, errors['filename'] = text_from_elem(element, 'filename', length=255, textify=True)
    image_caption, errors['caption'] = text_from_elem(element, 'caption', length=255)
    image_metadata = element.find('imagemetadata')
    image_title, errors['title'] = text_from_elem(image_metadata, 'title', length=255)
    image_creator, errors['creator'] = text_from_elem(image_metadata, 'creator', length=255, textify=True)
    image_media, errors['media'] = text_from_elem(image_metadata, 'media', length=255, textify=True)
    image_photographer, errors['photographer'] = text_from_elem(image_metadata, 'photographer', length=255, textify=True)
    image_rights, errors['rights'] = text_from_elem(image_metadata, 'rights', length=255, textify=True)

    # Create image
    try:
        image = RcaImage.objects.get(rca_content_id=image_contentid)
    except RcaImage.DoesNotExist:
        image = RcaImage()
    image.rca_content_id = image_contentid
    image.title = image_title
    image.alt = image_caption
    image.creator = image_creator
    image.medium = image_media
    image.photographer = image_photographer
    image.permission = image_rights

    if self.save:
        if not image.id:
            # New image: attach the file before the first save.
            try:
                # Image data is binary; text mode can corrupt it on platforms
                # that translate line endings.
                with File(open(self.image_path + image_filename.encode('utf-8'), 'rb')) as f:
                    image.file = f
                    image.save()
            except IOError as e:
                print("I/O error({0}): {1}".format(e.errno, e.strerror))
                print(repr(image_filename))
                return None, None
            except ValueError:
                print("Could not convert data to an integer.")
                return None, None
            except Exception:
                import sys
                print("Unexpected error: {0}".format(sys.exc_info()[0]))
                raise
        else:
            # Existing image: only the metadata is refreshed.
            image.save()

    return image, errors
def import_student_researchpage(self, studentpage, element):
    """Create or update a student ``ResearchItem`` from a ``<page>`` element.

    The item is matched on the element's ``contentid``, filled from the page
    title and texts, and takes school and programme from ``studentpage``.
    Database writes happen only when ``self.save`` is true.

    Returns:
        Dict with the ``"title"`` and ``"texts"`` error values.
    """
    errors = {}

    # Page data
    content_id = element.attrib["contentid"]
    title, errors["title"] = text_from_elem(element, "title", length=255, textify=True)
    body, errors["texts"] = self.import_texts(element.find("texts"))

    # Find the existing research item or start a new one
    try:
        item = ResearchItem.objects.get(rca_content_id=content_id)
    except ResearchItem.DoesNotExist:
        item = ResearchItem()
    item.rca_content_id = content_id
    item.title = title
    item.research_type = "student"
    item.description = body
    item.school = studentpage.school
    item.programme = studentpage.programme
    item.slug = make_slug(item)

    if self.save:
        if not item.id:
            self.research_index_page.add_child(item)
        else:
            item.save()

        # Link the item to the student who created it.
        # NOTE(review): nesting under ``self.save`` was reconstructed from
        # whitespace-collapsed source; confirm against the original file.
        ResearchItemCreator.objects.get_or_create(page=item, person=studentpage)

    # Carousel images
    images_node = element.find("images")
    image_nodes = [] if images_node is None else images_node.findall("image")
    for node in image_nodes:
        picture, _ = self.import_image(node)
        if self.save and picture is not None:
            ResearchItemCarouselItem.objects.get_or_create(page=item, image=picture)

    return errors
def import_student_researchpage(self, studentpage, element):
    """Create or update a student ``ResearchItem`` from a ``<page>`` element.

    The item is matched on the element's ``contentid``, filled from the page
    title and texts, and takes school and programme from ``studentpage``.
    Database writes happen only when ``self.save`` is true.

    Returns:
        Dict with the ``"title"`` and ``"texts"`` error values.
    """
    errors = {}

    # Page data
    content_id = element.attrib["contentid"]
    title, errors["title"] = text_from_elem(element, "title", length=255, textify=True)
    body, errors["texts"] = self.import_texts(element.find("texts"))

    # Find the existing research item or start a new one
    try:
        item = ResearchItem.objects.get(rca_content_id=content_id)
    except ResearchItem.DoesNotExist:
        item = ResearchItem()
    item.rca_content_id = content_id
    item.title = title
    item.research_type = "student"
    item.description = body
    item.school = studentpage.school
    item.programme = studentpage.programme
    item.slug = make_slug(item)

    if self.save:
        if not item.id:
            self.research_index_page.add_child(item)
        else:
            item.save()

        # Link the item to the student who created it.
        # NOTE(review): nesting under ``self.save`` was reconstructed from
        # whitespace-collapsed source; confirm against the original file.
        ResearchItemCreator.objects.get_or_create(page=item, person=studentpage)

    # Carousel images
    images_node = element.find("images")
    image_nodes = [] if images_node is None else images_node.findall("image")
    for node in image_nodes:
        picture, _ = self.import_image(node)
        if self.save and picture is not None:
            ResearchItemCarouselItem.objects.get_or_create(page=item, image=picture)

    return errors
def import_image(element):
    """Create or update the ``RcaImage`` described by an ``<image>`` element.

    The image is looked up by its ``contentid`` attribute; a new ``RcaImage``
    is built when none exists. Metadata is copied from the element and its
    ``<imagemetadata>`` child. For a new image the file is read from
    ``IMAGE_PATH + filename``; an existing image is saved with the refreshed
    metadata only.

    Args:
        element: The ``<image>`` XML element.

    Returns:
        ``(image, errors)`` where ``errors`` maps field names to the error
        values reported by ``text_from_elem``; ``(None, None)`` when the
        image file could not be loaded (IOError or ValueError).
    """
    errors = {}

    # Get image info
    image_contentid = element.attrib['contentid']
    image_filename, errors['filename'] = text_from_elem(element, 'filename', length=255, textify=True)
    image_caption, errors['caption'] = text_from_elem(element, 'caption', length=255)
    image_metadata = element.find('imagemetadata')
    image_title, errors['title'] = text_from_elem(image_metadata, 'title', length=255, textify=True)
    image_creator, errors['creator'] = text_from_elem(image_metadata, 'creator', length=255, textify=True)
    image_media, errors['media'] = text_from_elem(image_metadata, 'media', length=255, textify=True)
    image_photographer, errors['photographer'] = text_from_elem(image_metadata, 'photographer', length=255, textify=True)
    image_rights, errors['rights'] = text_from_elem(image_metadata, 'rights', length=255, textify=True)

    # Create image
    try:
        image = RcaImage.objects.get(rca_content_id=image_contentid)
    except RcaImage.DoesNotExist:
        image = RcaImage()
    image.rca_content_id = image_contentid
    image.title = image_title
    image.alt = image_caption
    image.creator = image_creator
    image.medium = image_media
    image.photographer = image_photographer
    image.permission = image_rights

    if not image.id:
        # New image: attach the file before the first save.
        try:
            # Image data is binary; text mode can corrupt it on platforms
            # that translate line endings.
            with File(open(IMAGE_PATH + image_filename.encode('utf-8'), 'rb')) as f:
                image.file = f
                image.save()
        except IOError as e:
            print("I/O error({0}): {1}".format(e.errno, e.strerror))
            print(repr(image_filename))
            return None, None
        except ValueError:
            print("Could not convert data to an integer.")
            return None, None
        except Exception:
            import sys
            print("Unexpected error: {0}".format(sys.exc_info()[0]))
            raise
    else:
        # Existing image: only the metadata is refreshed.
        image.save()

    return image, errors
def import_staff(element):
    """Create or update the ``StaffPage`` (and its role) for one ``<staff>`` element.

    Reads the basic fields, emails, URLs and supervised students from the
    element, saves the page (adding it under ``STAFF_INDEX_PAGE`` when new),
    saves a ``StaffPageRole``, imports carousel images and research pages,
    and finally appends the staff member's URLs as an HTML list to
    ``staffpage.practice``.

    Args:
        element: The ``<staff>`` XML element.

    Returns:
        Dict mapping field names to the error values reported by
        ``text_from_elem``.
    """
    errors = {}

    # Basic info
    staff_contentid = element.attrib['contentid']
    staff_title, errors['title'] = text_from_elem(element, 'title', length=255, textify=True)
    staff_name, errors['name'] = text_from_elem(element, 'staffname', length=255, textify=True)
    staff_programme, errors['programme'] = text_from_elem(element, 'programme', length=255, textify=True)
    staff_statement, errors['statement'] = text_from_elem(element, 'statement')
    staff_biography, errors['biography'] = text_from_elem(element, 'biography')
    staff_school, errors['school'] = text_from_elem(element, 'school', length=255, textify=True)
    staff_editorialreference, errors['editorialreference'] = text_from_elem(element, 'editorialreference', length=255, textify=True)

    # Emails
    emails_element = element.find('emails')
    if emails_element is not None:
        staff_emails = [email.text for email in emails_element.findall('email')]
    else:
        staff_emails = []

    # URLs
    urls_element = element.find('urls')
    if urls_element is not None:
        staff_urls = [url.text for url in urls_element.findall('url')]
    else:
        staff_urls = []

    # Supervised students (empty entries are dropped)
    staff_supervisedstudents = []
    supervisedstudents_element = element.find('supervisedstudents')
    if supervisedstudents_element is not None:
        staff_supervisedstudents = [
            student.text
            for student in supervisedstudents_element.findall('supervisedstudent')
            if student.text is not None
        ]

    # Cleanup statement and biography
    staff_statement = cleanup_html(staff_statement)
    staff_biography = cleanup_html(staff_biography)

    # Split name into title prefix, first name and last name.
    name_split = staff_name.split()
    if staff_name == "A L Rees":
        # "A L Rees" needs to be split up manually
        staff_titleprefix = ""
        staff_firstname = "A L"
        staff_lastname = "Rees"
    elif name_split and name_split[0] in ("Professor", "Dr", "Sir"):
        staff_titleprefix = name_split[0]
        # Slices instead of indexing so a lone title does not raise IndexError.
        staff_firstname = " ".join(name_split[1:2])
        staff_lastname = " ".join(name_split[2:])
    else:
        staff_titleprefix = ""
        staff_firstname = " ".join(name_split[:1])
        staff_lastname = " ".join(name_split[1:])

    # Remove a trailing "Programme"/"Programmes"; an empty programme is left alone.
    staff_programme_split = staff_programme.split()
    if staff_programme_split and staff_programme_split[-1] in ("Programme", "Programmes"):
        staff_programme = " ".join(staff_programme_split[:-1])

    # Slugs. The school slug is looked up by the school name; the previous
    # code used the programme name here and never used staff_school.
    staff_programme_slug = constants.PROGRAMMES.get(staff_programme, "")
    staff_school_slug = constants.SCHOOLS.get(staff_school, "")

    # Create page for staff member
    try:
        staffpage = StaffPage.objects.get(rca_content_id=staff_contentid)
    except StaffPage.DoesNotExist:
        staffpage = StaffPage()
    staffpage.rca_content_id = staff_contentid
    staffpage.title = staff_name
    staffpage.school = staff_school_slug
    staffpage.staff_type = "academic"
    staffpage.intro = staff_statement
    staffpage.biography = staff_biography
    staffpage.show_on_homepage = False
    staffpage.show_on_programme_page = False
    staffpage.title_prefix = staff_titleprefix
    staffpage.first_name = staff_firstname
    staffpage.last_name = staff_lastname
    if staff_supervisedstudents:
        staffpage.supervised_student_other = ", ".join(staff_supervisedstudents)
    staffpage.slug = make_slug(staffpage)
    if staffpage.id:
        staffpage.save()
    else:
        STAFF_INDEX_PAGE.add_child(staffpage)

    # Create role
    try:
        staffpagerole = StaffPageRole.objects.get(page=staffpage)
    except StaffPageRole.DoesNotExist:
        staffpagerole = StaffPageRole()
        staffpagerole.page = staffpage
    staffpagerole.title = staff_title
    staffpagerole.school = staff_school_slug
    staffpagerole.programme = staff_programme_slug
    if staff_emails:
        staffpagerole.email = staff_emails[0]
    staffpagerole.save()

    # Images
    images_element = element.find('images')
    if images_element is not None:
        for image in images_element.findall('image'):
            # Import the image
            theimage, error = import_image(image)

            # Add to carousel
            if theimage is not None:
                StaffPageCarouselItem.objects.get_or_create(page=staffpage, image=theimage)

    # Research pages
    researchpages_element = element.find('researchpages')
    if researchpages_element is not None:
        for researchpage in researchpages_element.findall('page'):
            import_staff_researchpage(staffpage, researchpage)

        # Research child pages
        research_childpages_element = researchpages_element.find('childpages')
        if research_childpages_element is not None:
            for childpage in research_childpages_element.findall('page'):
                import_staff_researchpage(staffpage, childpage)

    # Append URLs to the bottom of the practice block
    url_items = []
    for url in staff_urls:
        # Empty <url/> elements have ``text`` None (or blank); skip them.
        if not url or not url.strip():
            continue
        url_valid = url
        if "://" not in url_valid:
            url_valid = "http://" + url_valid
        url_items.append("<li><a href=\"" + url_valid + "\">" + url + "</a></li>")
    if url_items:
        # Only add the list when there is something to show.
        staffpage.practice = (staffpage.practice or "") + "<ul>" + "".join(url_items) + "</ul>"

    # Resave page
    staffpage.save()

    return errors
def import_staff_researchpage(staffpage, element):
    """Import one staff research ``<page>`` element.

    When the page title is a key of ``interesting_pages`` the page texts are
    stored on the corresponding ``staffpage`` attribute and the page's images
    join the staff page carousel. Otherwise a ``ResearchItem`` of type
    ``"staff"`` is created or updated, linked to ``staffpage`` as creator,
    and the images join that item's carousel.

    Returns:
        Dict with the ``'title'`` and ``'texts'`` error values.
    """
    errors = {}

    # Page data
    content_id = element.attrib['contentid']
    title, errors['title'] = text_from_elem(element, 'title', length=255, textify=True)
    body, errors['texts'] = import_texts(element.find('texts'))

    if title in interesting_pages:
        # The texts belong on a field of the staff page itself.
        setattr(staffpage, interesting_pages[title], body)
        carousel_model, carousel_owner = StaffPageCarouselItem, staffpage
    else:
        # School and programme come from the staff member's role, if any.
        try:
            role = StaffPageRole.objects.get(page=staffpage)
        except StaffPageRole.DoesNotExist:
            school = ""
            programme = ""
        else:
            school = role.school
            programme = role.programme

        # Find the existing research item or start a new one
        try:
            item = ResearchItem.objects.get(rca_content_id=content_id)
        except ResearchItem.DoesNotExist:
            item = ResearchItem()
        item.rca_content_id = content_id
        item.title = title
        item.research_type = "staff"
        item.description = body
        item.school = school
        item.programme = programme
        item.slug = make_slug(item)
        if not item.id:
            RESEARCH_INDEX_PAGE.add_child(item)
        else:
            item.save()

        # Link to creator
        ResearchItemCreator.objects.get_or_create(page=item, person=staffpage)
        carousel_model, carousel_owner = ResearchItemCarouselItem, item

    # Carousel images go to whichever page was selected above.
    images_node = element.find('images')
    if images_node is not None:
        for node in images_node.findall('image'):
            picture, _ = import_image(node)
            if picture is not None:
                carousel_model.objects.get_or_create(page=carousel_owner, image=picture)

    return errors
def import_student(self, element):
    """Create or update the ``StudentPage`` for one student record element.

    Reads the basic fields, contact details and supervisor from the element,
    normalises the programme and school names, fills a ``StudentPage``
    matched on ``contentid``, then imports carousel images and research
    pages. Database writes happen only when ``self.save`` is true. Records
    whose name is in ``IGNORED_NAMES`` are skipped.

    Args:
        element: The student record XML element.

    Returns:
        None.
    """
    errors = {}

    # Basic info
    student_contentid = element.attrib["contentid"]
    student_title, errors["title"] = text_from_elem(element, "title", length=255, textify=True)
    student_name, errors["name"] = text_from_elem(element, "staffname", length=255, textify=True)
    student_programme, errors["programme"] = text_from_elem(element, "programme", length=255, textify=True)
    student_biography, errors["biography"] = text_from_elem(element, "biography")
    student_school, errors["school"] = text_from_elem(element, "school", length=255, textify=True)
    student_editorialreference, errors["editorialreference"] = text_from_elem(element, "editorialreference", length=255, textify=True)

    # If name is in ignore list, skip it
    if student_name in IGNORED_NAMES:
        return

    # Emails and URLs. Empty elements (``text`` is None or "") are dropped
    # so no blank contact rows are created.
    emails_element = element.find("emails")
    student_emails = []
    if emails_element is not None:
        student_emails = [email.text for email in emails_element.findall("email") if email.text]

    urls_element = element.find("urls")
    student_urls = []
    if urls_element is not None:
        student_urls = [url.text for url in urls_element.findall("url") if url.text]

    # Supervisor
    student_supervisor = None
    supervisedstudents_element = element.find("supervisedstudents")
    if supervisedstudents_element is not None:
        supervisor_element = supervisedstudents_element.find("supervisedstudent")
        # An empty <supervisedstudents> wrapper has no child to read a name
        # from; leave the supervisor unset instead of raising AttributeError.
        if supervisor_element is not None:
            # Get page for supervisor
            student_supervisor = self.find_staff_page(supervisor_element.text)

    # Cleanup biography
    student_biography = cleanup_html(student_biography)

    # Split name into first name and last name
    name_split = student_name.split()
    student_firstname = " ".join(name_split[:1])
    student_lastname = " ".join(name_split[1:])

    # Remove a trailing "Programme"/"Programmes" from student_programme
    student_programme_split = student_programme.split()
    if student_programme_split and student_programme_split[-1] in ("Programme", "Programmes"):
        student_programme = " ".join(student_programme_split[:-1])

    # Remove a leading literal backslash-r (two characters) from the programme
    if student_programme and student_programme.startswith("\\r"):
        student_programme = student_programme[2:]

    # Remove a literal backslash-n from the beginning and end of the school
    if student_school and student_school.startswith("\\n"):
        student_school = student_school[2:]
    if student_school and student_school.endswith("\\n"):
        student_school = student_school[:-2]

    # If student is in STUDENT_PROGRAMMES, use the programme set there
    if student_name in STUDENT_PROGRAMMES:
        student_programme = STUDENT_PROGRAMMES[student_name]

    # Slugs
    student_programme_slug = constants.PROGRAMMES.get(student_programme, "")
    student_school_slug = constants.SCHOOLS.get(student_school, "")
    degree_subject_slug = constants.DEGREE_SUBJECTS.get(student_programme, "")

    # Create page for student
    try:
        studentpage = StudentPage.objects.get(rca_content_id=student_contentid)
    except StudentPage.DoesNotExist:
        studentpage = StudentPage()
    studentpage.rca_content_id = student_contentid
    studentpage.title = student_name
    studentpage.school = student_school_slug
    studentpage.programme = student_programme_slug
    studentpage.degree_qualification = "researchstudent"
    studentpage.degree_subject = degree_subject_slug
    studentpage.degree_year = ""
    studentpage.statement = student_biography
    studentpage.funding = student_title
    studentpage.show_on_homepage = False
    studentpage.show_on_programme_page = False
    studentpage.first_name = student_firstname
    studentpage.last_name = student_lastname
    studentpage.supervisor = student_supervisor
    studentpage.slug = make_slug(studentpage)

    if self.save:
        if studentpage.id:
            studentpage.save()
        else:
            self.student_index_page.add_child(studentpage)

        # NOTE(review): contact rows are written only when saving; this
        # nesting was reconstructed from whitespace-collapsed source.
        # Emails
        for email in student_emails:
            StudentPageContactsEmail.objects.get_or_create(page=studentpage, email=email)

        # URLs
        for url in student_urls:
            StudentPageContactsWebsite.objects.get_or_create(page=studentpage, website=url)

    # Images
    images_element = element.find("images")
    if images_element is not None:
        for image in images_element.findall("image"):
            # Import the image
            theimage, error = self.import_image(image)

            if theimage is not None and self.save:
                # Add to carousel
                StudentPageCarouselItem.objects.get_or_create(page=studentpage, image=theimage)

    # Research pages
    researchpages_element = element.find("researchpages")
    if researchpages_element is not None:
        for researchpage in researchpages_element.findall("page"):
            self.import_student_researchpage(studentpage, researchpage)

        # Research child pages
        research_childpages_element = researchpages_element.find("childpages")
        if research_childpages_element is not None:
            for childpage in research_childpages_element.findall("page"):
                self.import_student_researchpage(studentpage, childpage)

    # Resave page
    if self.save:
        studentpage.save()
def import_student(self, element):
    """Create or update the ``StudentPage`` for one student record element.

    Reads the basic fields, contact details and supervisor from the element,
    normalises the programme and school names, fills a ``StudentPage``
    matched on ``contentid``, then imports carousel images and research
    pages. Database writes happen only when ``self.save`` is true. Records
    whose name is in ``IGNORED_NAMES`` are skipped.

    Args:
        element: The student record XML element.

    Returns:
        None.
    """
    errors = {}

    # Basic info
    student_contentid = element.attrib["contentid"]
    student_title, errors["title"] = text_from_elem(element, "title", length=255, textify=True)
    student_name, errors["name"] = text_from_elem(element, "staffname", length=255, textify=True)
    student_programme, errors["programme"] = text_from_elem(element, "programme", length=255, textify=True)
    student_biography, errors["biography"] = text_from_elem(element, "biography")
    student_school, errors["school"] = text_from_elem(element, "school", length=255, textify=True)
    student_editorialreference, errors["editorialreference"] = text_from_elem(element, "editorialreference", length=255, textify=True)

    # If name is in ignore list, skip it
    if student_name in IGNORED_NAMES:
        return

    # Emails and URLs. Empty elements (``text`` is None or "") are dropped
    # so no blank contact rows are created.
    emails_element = element.find("emails")
    student_emails = []
    if emails_element is not None:
        student_emails = [email.text for email in emails_element.findall("email") if email.text]

    urls_element = element.find("urls")
    student_urls = []
    if urls_element is not None:
        student_urls = [url.text for url in urls_element.findall("url") if url.text]

    # Supervisor
    student_supervisor = None
    supervisedstudents_element = element.find("supervisedstudents")
    if supervisedstudents_element is not None:
        supervisor_element = supervisedstudents_element.find("supervisedstudent")
        # An empty <supervisedstudents> wrapper has no child to read a name
        # from; leave the supervisor unset instead of raising AttributeError.
        if supervisor_element is not None:
            # Get page for supervisor
            student_supervisor = self.find_staff_page(supervisor_element.text)

    # Cleanup biography
    student_biography = cleanup_html(student_biography)

    # Split name into first name and last name
    name_split = student_name.split()
    student_firstname = " ".join(name_split[:1])
    student_lastname = " ".join(name_split[1:])

    # Remove a trailing "Programme"/"Programmes" from student_programme
    student_programme_split = student_programme.split()
    if student_programme_split and student_programme_split[-1] in ("Programme", "Programmes"):
        student_programme = " ".join(student_programme_split[:-1])

    # Remove a leading literal backslash-r (two characters) from the programme
    if student_programme and student_programme.startswith("\\r"):
        student_programme = student_programme[2:]

    # Remove a literal backslash-n from the beginning and end of the school
    if student_school and student_school.startswith("\\n"):
        student_school = student_school[2:]
    if student_school and student_school.endswith("\\n"):
        student_school = student_school[:-2]

    # If student is in STUDENT_PROGRAMMES, use the programme set there
    if student_name in STUDENT_PROGRAMMES:
        student_programme = STUDENT_PROGRAMMES[student_name]

    # Slugs
    student_programme_slug = constants.PROGRAMMES.get(student_programme, "")
    student_school_slug = constants.SCHOOLS.get(student_school, "")
    degree_subject_slug = constants.DEGREE_SUBJECTS.get(student_programme, "")

    # Create page for student
    try:
        studentpage = StudentPage.objects.get(rca_content_id=student_contentid)
    except StudentPage.DoesNotExist:
        studentpage = StudentPage()
    studentpage.rca_content_id = student_contentid
    studentpage.title = student_name
    studentpage.school = student_school_slug
    studentpage.programme = student_programme_slug
    studentpage.degree_qualification = "researchstudent"
    studentpage.degree_subject = degree_subject_slug
    studentpage.degree_year = ""
    studentpage.statement = student_biography
    studentpage.funding = student_title
    studentpage.show_on_homepage = False
    studentpage.show_on_programme_page = False
    studentpage.first_name = student_firstname
    studentpage.last_name = student_lastname
    studentpage.supervisor = student_supervisor
    studentpage.slug = make_slug(studentpage)

    if self.save:
        if studentpage.id:
            studentpage.save()
        else:
            self.student_index_page.add_child(studentpage)

        # NOTE(review): contact rows are written only when saving; this
        # nesting was reconstructed from whitespace-collapsed source.
        # Emails
        for email in student_emails:
            StudentPageContactsEmail.objects.get_or_create(page=studentpage, email=email)

        # URLs
        for url in student_urls:
            StudentPageContactsWebsite.objects.get_or_create(page=studentpage, website=url)

    # Images
    images_element = element.find("images")
    if images_element is not None:
        for image in images_element.findall("image"):
            # Import the image
            theimage, error = self.import_image(image)

            if theimage is not None and self.save:
                # Add to carousel
                StudentPageCarouselItem.objects.get_or_create(page=studentpage, image=theimage)

    # Research pages
    researchpages_element = element.find("researchpages")
    if researchpages_element is not None:
        for researchpage in researchpages_element.findall("page"):
            self.import_student_researchpage(studentpage, researchpage)

        # Research child pages
        research_childpages_element = researchpages_element.find("childpages")
        if research_childpages_element is not None:
            for childpage in research_childpages_element.findall("page"):
                self.import_student_researchpage(studentpage, childpage)

    # Resave page
    if self.save:
        studentpage.save()
def import_staff(element):
    """Create or update the ``StaffPage`` (and its role) for one ``<staff>`` element.

    Reads the basic fields, emails, URLs and supervised students from the
    element, saves the page (adding it under ``STAFF_INDEX_PAGE`` when new),
    saves a ``StaffPageRole``, imports carousel images and research pages,
    and finally appends the staff member's URLs as an HTML list to
    ``staffpage.practice``.

    Args:
        element: The ``<staff>`` XML element.

    Returns:
        Dict mapping field names to the error values reported by
        ``text_from_elem``.
    """
    errors = {}

    # Basic info
    staff_contentid = element.attrib['contentid']
    staff_title, errors['title'] = text_from_elem(element, 'title', length=255, textify=True)
    staff_name, errors['name'] = text_from_elem(element, 'staffname', length=255, textify=True)
    staff_programme, errors['programme'] = text_from_elem(element, 'programme', length=255, textify=True)
    staff_statement, errors['statement'] = text_from_elem(element, 'statement')
    staff_biography, errors['biography'] = text_from_elem(element, 'biography')
    staff_school, errors['school'] = text_from_elem(element, 'school', length=255, textify=True)
    staff_editorialreference, errors['editorialreference'] = text_from_elem(element, 'editorialreference', length=255, textify=True)

    # Emails
    emails_element = element.find('emails')
    if emails_element is not None:
        staff_emails = [email.text for email in emails_element.findall('email')]
    else:
        staff_emails = []

    # URLs
    urls_element = element.find('urls')
    if urls_element is not None:
        staff_urls = [url.text for url in urls_element.findall('url')]
    else:
        staff_urls = []

    # Supervised students (empty entries are dropped)
    staff_supervisedstudents = []
    supervisedstudents_element = element.find('supervisedstudents')
    if supervisedstudents_element is not None:
        staff_supervisedstudents = [
            student.text
            for student in supervisedstudents_element.findall('supervisedstudent')
            if student.text is not None
        ]

    # Cleanup statement and biography
    staff_statement = cleanup_html(staff_statement)
    staff_biography = cleanup_html(staff_biography)

    # Split name into title prefix, first name and last name.
    name_split = staff_name.split()
    if staff_name == "A L Rees":
        # "A L Rees" needs to be split up manually
        staff_titleprefix = ""
        staff_firstname = "A L"
        staff_lastname = "Rees"
    elif name_split and name_split[0] in ("Professor", "Dr", "Sir"):
        staff_titleprefix = name_split[0]
        # Slices instead of indexing so a lone title does not raise IndexError.
        staff_firstname = " ".join(name_split[1:2])
        staff_lastname = " ".join(name_split[2:])
    else:
        staff_titleprefix = ""
        staff_firstname = " ".join(name_split[:1])
        staff_lastname = " ".join(name_split[1:])

    # Remove a trailing "Programme"/"Programmes"; an empty programme is left alone.
    staff_programme_split = staff_programme.split()
    if staff_programme_split and staff_programme_split[-1] in ("Programme", "Programmes"):
        staff_programme = " ".join(staff_programme_split[:-1])

    # Slugs. The school slug is looked up by the school name; the previous
    # code used the programme name here and never used staff_school.
    staff_programme_slug = constants.PROGRAMMES.get(staff_programme, "")
    staff_school_slug = constants.SCHOOLS.get(staff_school, "")

    # Create page for staff member
    try:
        staffpage = StaffPage.objects.get(rca_content_id=staff_contentid)
    except StaffPage.DoesNotExist:
        staffpage = StaffPage()
    staffpage.rca_content_id = staff_contentid
    staffpage.title = staff_name
    staffpage.school = staff_school_slug
    staffpage.staff_type = "academic"
    staffpage.intro = staff_statement
    staffpage.biography = staff_biography
    staffpage.show_on_homepage = False
    staffpage.show_on_programme_page = False
    staffpage.title_prefix = staff_titleprefix
    staffpage.first_name = staff_firstname
    staffpage.last_name = staff_lastname
    if staff_supervisedstudents:
        staffpage.supervised_student_other = ", ".join(staff_supervisedstudents)
    staffpage.slug = make_slug(staffpage)
    if staffpage.id:
        staffpage.save()
    else:
        STAFF_INDEX_PAGE.add_child(staffpage)

    # Create role
    try:
        staffpagerole = StaffPageRole.objects.get(page=staffpage)
    except StaffPageRole.DoesNotExist:
        staffpagerole = StaffPageRole()
        staffpagerole.page = staffpage
    staffpagerole.title = staff_title
    staffpagerole.school = staff_school_slug
    staffpagerole.programme = staff_programme_slug
    if staff_emails:
        staffpagerole.email = staff_emails[0]
    staffpagerole.save()

    # Images
    images_element = element.find('images')
    if images_element is not None:
        for image in images_element.findall('image'):
            # Import the image
            theimage, error = import_image(image)

            # Add to carousel
            if theimage is not None:
                StaffPageCarouselItem.objects.get_or_create(page=staffpage, image=theimage)

    # Research pages
    researchpages_element = element.find('researchpages')
    if researchpages_element is not None:
        for researchpage in researchpages_element.findall('page'):
            import_staff_researchpage(staffpage, researchpage)

        # Research child pages
        research_childpages_element = researchpages_element.find('childpages')
        if research_childpages_element is not None:
            for childpage in research_childpages_element.findall('page'):
                import_staff_researchpage(staffpage, childpage)

    # Append URLs to the bottom of the practice block
    url_items = []
    for url in staff_urls:
        # Empty <url/> elements have ``text`` None (or blank); skip them.
        if not url or not url.strip():
            continue
        url_valid = url
        if "://" not in url_valid:
            url_valid = "http://" + url_valid
        url_items.append("<li><a href=\"" + url_valid + "\">" + url + "</a></li>")
    if url_items:
        # Only add the list when there is something to show.
        staffpage.practice = (staffpage.practice or "") + "<ul>" + "".join(url_items) + "</ul>"

    # Resave page
    staffpage.save()

    return errors
def import_staff_researchpage(staffpage, element):
    """Import one staff research ``<page>`` element.

    When the page title is a key of ``interesting_pages`` the page texts are
    stored on the corresponding ``staffpage`` attribute and the page's images
    join the staff page carousel. Otherwise a ``ResearchItem`` of type
    ``"staff"`` is created or updated, linked to ``staffpage`` as creator,
    and the images join that item's carousel.

    Returns:
        Dict with the ``'title'`` and ``'texts'`` error values.
    """
    errors = {}

    # Page data
    content_id = element.attrib['contentid']
    title, errors['title'] = text_from_elem(element, 'title', length=255, textify=True)
    body, errors['texts'] = import_texts(element.find('texts'))

    if title in interesting_pages:
        # The texts belong on a field of the staff page itself.
        setattr(staffpage, interesting_pages[title], body)
        carousel_model, carousel_owner = StaffPageCarouselItem, staffpage
    else:
        # School and programme come from the staff member's role, if any.
        try:
            role = StaffPageRole.objects.get(page=staffpage)
        except StaffPageRole.DoesNotExist:
            school = ""
            programme = ""
        else:
            school = role.school
            programme = role.programme

        # Find the existing research item or start a new one
        try:
            item = ResearchItem.objects.get(rca_content_id=content_id)
        except ResearchItem.DoesNotExist:
            item = ResearchItem()
        item.rca_content_id = content_id
        item.title = title
        item.research_type = "staff"
        item.description = body
        item.school = school
        item.programme = programme
        item.slug = make_slug(item)
        if not item.id:
            RESEARCH_INDEX_PAGE.add_child(item)
        else:
            item.save()

        # Link to creator
        ResearchItemCreator.objects.get_or_create(page=item, person=staffpage)
        carousel_model, carousel_owner = ResearchItemCarouselItem, item

    # Carousel images go to whichever page was selected above.
    images_node = element.find('images')
    if images_node is not None:
        for node in images_node.findall('image'):
            picture, _ = import_image(node)
            if picture is not None:
                carousel_model.objects.get_or_create(page=carousel_owner, image=picture)

    return errors
def doimport(**kwargs): path = kwargs.get('path', PATH) save = kwargs.get('save', False) image_path = kwargs.get('image_path', IMAGE_PATH) ruthless = kwargs.get('ruthless', False) newsindex = NEWS_INDEX tree = ET.parse(path) root = tree.getroot() errors = [] images_errors = [] for item in root.findall('news_item'): itemerrors = {} # sort out what instance this is news_contentid = item.attrib['contentid'] title, itemerrors['title'] = text_from_elem(item, 'title', length=255) date = parse_date( item.find('goinglivedate').text.strip().replace( '.', '-')) or datetime.date.today() try: newsitem = NewsItem.objects.get(rca_content_id=news_contentid) except NewsItem.DoesNotExist: newsitem = NewsItem(rca_content_id=news_contentid) newsitem.title = title newsitem.date = date newsitem.intro = richtext_from_elem(item.find('intro')) newsitem.slug = make_slug(newsitem) # possibly delete any images that are embedded in the existing body if ruthless: soup = BeautifulSoup(newsitem.body, 'html.parser') to_delete_ids = [] for x in soup.find_all('embed'): try: to_delete_ids.append(int(x.attrs['id'])) except ValueError: pass if to_delete_ids: RcaImage.objects.filter(id__in=to_delete_ids).delete() # build the body strings = [] if item.find('texts'): for elem in item.find('texts').findall('text'): html = richtext_from_elem(elem.find('content')) strings.append(html) newsitem.body = '\n'.join(strings) # save newsitem if save: if newsitem.id: newsitem.save() else: newsindex.add_child(newsitem) tobesaved = False if item.find('images') is not None: # first delete images that haven't got a contentid if ruthless: for c in NewsItemCarouselItem.objects.filter(page=newsitem): c.image.delete() c.delete() for image in item.find('images').findall('image'): imageerrors = {} metadata = image.find('imagemetadata') im_contentid = image.attrib['contentid'] filename = urllib2.unquote(image.find('filename').text.strip()) try: theimage = RcaImage.objects.get( rca_content_id=im_contentid) except 
RcaImage.DoesNotExist: theimage = RcaImage(rca_content_id=im_contentid) theimage.title, imageerrors['title'] = text_from_elem( metadata, 'title', length=255, textify=True) theimage.creator, imageerrors['creator'] = text_from_elem( metadata, 'creator', length=255, textify=True) theimage.medium, imageerrors['medium'] = text_from_elem( metadata, 'media', length=255, textify=True) theimage.photographer, imageerrors['photog'] = text_from_elem( metadata, 'photographer', length=255, textify=True) theimage.permission, imageerrors['perms'] = text_from_elem( metadata, 'rights', length=255, textify=True) caption, imageerrors['caption'] = text_from_elem(metadata, 'caption', length=255, textify=True) theimage.alt = caption #theimage.width, imageerrors['width'] = text_from_elem(metadata, 'width', length=255) #theimage.height, imageerrors['height'] = text_from_elem(metadata, 'height', length=255) try: with File(open(image_path + filename.encode('utf-8'), 'r')) as f: if theimage.id: if save: theimage.delete() theimage.file = f if save: theimage.save() except IOError as e: print "I/O error({0}): {1}".format(e.errno, e.strerror) print repr(filename) except ValueError: print "Could not convert data to an integer." 
except: import sys print "Unexpected error:", sys.exc_info()[0] raise if save and theimage.is_landscape(): try: carousel = NewsItemCarouselItem.objects.get( page=newsitem, image=theimage, ) except NewsItemCarouselItem.DoesNotExist: carousel = NewsItemCarouselItem( page=newsitem, image=theimage, ) if save: carousel.save() elif save and theimage.id: imagestring = '<embed alt="%(alt)s" embedtype="image" format="right" id="%(id)s"/>' % { 'alt': theimage.alt, 'id': theimage.id, } newsitem.body = imagestring + newsitem.body tobesaved = True imageerrordict = dict( (k, v) for k, v in imageerrors.iteritems() if v) if imageerrordict: images_errors.append({image: imageerrordict}) if tobesaved and save: newsitem.save() errordict = dict((k, v) for k, v in itemerrors.iteritems() if v) if errordict: errors.append({item: errordict}) return errors, images_errors
def doimport(**kwargs): save = kwargs.get('save', False) path = kwargs.get('path', PATH) image_path = kwargs.get('image_path', IMAGE_PATH) show_index = SHOW_INDEX tree = ET.parse(path) root = tree.getroot() errors = {} images_errors = [] dept_count = 0 total_students = 0 new_count = 0 student_save_count = 0 for d in root.findall('department'): dept_count += 1 page = d.find('page') depterrors = {} dept_title, depterrors['title'] = text_from_elem(page, 'title') specialism = '' print '\nNow importing: ' + repr(dept_title) if dept_title in PROGRAMME_SPECIALISMS.keys(): dept_title, specialism = PROGRAMME_SPECIALISMS[dept_title] print 'dept: ' + repr(dept_title) theprogramme = PROGRAMMES[dept_title] print 'prog: ' + repr(theprogramme) theschool = SCHOOLS[dept_title] print 'scho: ' + repr(theschool) h = html2text.HTML2Text() h.body_width = 0 try: blurb = page.find('texts').findall('text')[0].find('content') except AttributeError: blurb = page.find('synopsis') blurb = h.handle(blurb.text).strip() print "Blurb: " + repr(blurb) print "******* note that the above text will not be imported *******" student_count = 0 for s in d.findall('student'): student_count += 1 s = s.find('studentpage') sp_contentid = s.attrib['contentid'] try: sp = StudentPage.objects.get(rca_content_id=sp_contentid) except StudentPage.DoesNotExist: sp = StudentPage(rca_content_id=sp_contentid) sp_errs = {} sp.title, sp_errs['title'] = text_from_elem(s, 'title', length=255) # there is no intro text in any of the data at time of writing # intro, sp_errs['intro'] = text_from_elem(s, 'intro') sp.slug = make_slug(sp) statement = richtext_from_elem(s.find('statement')) statement_text, sponsors, collaborators = statement_extract(statement) sp.statement = statement_text sp.work_description = statement_text # handle the metadata fields metadata = s.find('metadata') # format the current degree sp.degree_year, sp_errs['deg_year'] = text_from_elem(metadata, 'year', length=255) degree_subject, sp_errs['deg_subj'] = 
text_from_elem(metadata, 'degrees', length=255) if degree_subject[-1] == '?': degree_subject = degree_subject[:-1] sp.degree_subject = DEGREE_SUBJECTS[degree_subject] degree_qualification, sp_errs['deg_qual'] = text_from_elem(metadata, 'degree', length=255) sp.degree_qualification = degree_qualification.lower() # metadata contains first and last names in separate fields sp.first_name, sp_errs['first_name'] = text_from_elem(metadata, 'firstname', length=255) sp.last_name, sp_errs['last_name'] = text_from_elem(metadata, 'surname', length=255) # we worked out the programme and school earlier from the dept_page sp.programme = theprogramme sp.school = theschool if not specialism and metadata.find('specialism') is not None: sp.specialism, sp_errs['specialism'] = text_from_elem(metadata, 'specialism') else: sp.specialism = specialism # handle profile image try: profile_image = RcaImage.objects.get(rca_content_id=sp_contentid + 'profile_image') except RcaImage.DoesNotExist: profile_image = RcaImage(rca_content_id=sp_contentid + 'profile_image') profile_filename = slugify(unicode(sp.title)).replace('-','_') profile_image_path = image_path + "show_2013_profiles/2400_" + sp.programme + "/" profile_image.title = sp.title + ' profile image' if not profile_image.id: try: with File(open(normalize("NFKD", profile_image_path + profile_filename + '.jpg'), 'rb')) as f: profile_image.file = f if save: profile_image.save() except IOError as e: try: with File(open(normalize("NFKD", profile_image_path + profile_filename + '.png'), 'rb')) as f: profile_image.file = f if save: profile_image.save() except IOError as e: print "I/O error({0}): {1}".format(e.errno, e.strerror) + " " + profile_image_path + profile_filename sp_errs['image_not_found'] = profile_image_path + profile_filename except ValueError: print "Could not convert data to an integer." 
except: import sys print "Unexpected error:", sys.exc_info()[0] raise else: if save: profile_image.save() sp.profile_image = profile_image # save the studentpage for foreignkey purposes if save: student_save_count += 1 if sp.id: sp.save() else: new_count += 1 show_index.add_child(sp) elif not sp.id: new_count += 1 # handle the sponsors and collaborators from earlier for spon in sponsors: name, sp_errs['sponsors'] = check_length(spon, 255) if save: sponpage = StudentPageWorkSponsor(page=sp, name=name) sponpage.save() for col in collaborators: name, sp_errs['collaborators'] = check_length(col, 255) if save: colpage = StudentPageWorkCollaborator(page=sp, name=name) colpage.save() # handle the cv fields cv = s.find('cv') sp_errs['degree'] = cv_handle( cv, 'degrees', StudentPageDegree, sp, length=255, fieldname='degree', save=save) sp_errs['exhibition'] = cv_handle( cv, 'exhibition', StudentPageExhibition, sp, length=255, save=save) sp_errs['experience'] = cv_handle( cv, 'experience', StudentPageExperience, sp, length=255, save=save) sp_errs['awards'] = cv_handle( cv, 'awards', StudentPageAwards, sp, length=255, fieldname='award', save=save) if cv.find('sponsors') is not None: sp_errs['sponsors'] = cv_handle( cv, 'sponsors', StudentPageWorkSponsor, sp, length=255, fieldname='name', save=save) # currently the model doesn't have publications or conferences #sp_errs['publications'] = cv_handle( # cv, 'publications', StudentPagePublications, sp, length=255) #sp_errs['conferences'] = cv_handle( # cv, 'conferences', StudentPageConferences, sp, length=255) if s.find('emails') is not None: for emailaddress in s.find('emails').getchildren(): emailtext = emailaddress.text.strip() if save: StudentPageContactsEmail.objects.get_or_create(page=sp, email=emailtext) if s.find('phonenumbers') is not None: for num in s.find('phonenumbers').getchildren(): if num.text: phonenumber = num.text.strip() if save: StudentPageContactsPhone.objects.get_or_create(page=sp, phone=phonenumber) if 
s.find('urls') is not None: for url in s.find('urls').getchildren(): if url.text: urltext = url.text.strip() if save: StudentPageContactsWebsite.objects.get_or_create(page=sp, website=urltext) # handle images tag images = s.find('images') forloop_counter = 0 if images is not None: for image in images.findall('image'): forloop_counter += 1 imageerrors = {} metadata = image.find('imagemetadata') im_contentid = image.attrib['contentid'] if not im_contentid: im_contentid = sp_contentid + '_image_' + str(forloop_counter) try: theimage = RcaImage.objects.get(rca_content_id=im_contentid) except RcaImage.DoesNotExist: theimage = RcaImage(rca_content_id=im_contentid) theimage.title, imageerrors['title'] = text_from_elem(metadata, 'title', length=255, textify=True) theimage.creator, imageerrors['creator'] = text_from_elem(metadata, 'creator', length=255, textify=True) theimage.medium, imageerrors['medium'] = text_from_elem(metadata, 'media', length=255, textify=True) photographer, imageerrors['photographer'] = text_from_elem(metadata, 'photographer', length=255) if photographer.strip().startswith('©'): photographer = photographer.replace('©', '').strip() theimage.photographer = photographer theimage.permissions, imageerrors['permissions'] = text_from_elem(metadata, 'rights', length=255) caption, imageerrors['caption'] = text_from_elem(metadata, 'caption', length=255, textify=True) theimage.alt = caption #theimage.width, imageerrors['width'] = text_from_elem(metadata, 'width', length=255) #theimage.height, imageerrors['height'] = text_from_elem(metadata, 'height', length=255) filename = unicode(urllib2.unquote(image.find('filename').text.strip())) image_success = False full_image_path = image_path + '2400_' + sp.programme + "/" if not theimage.id: try: with File(open(normalize("NFKD", full_image_path + filename), 'rb')) as f: theimage.file = f if save: theimage.save() image_success = True except IOError as e: try: with File(open(normalize("NFKD", full_image_path + 
filename[:-4] + '.png'), 'rb')) as f: theimage.file = f if save: theimage.save() image_success = True except IOError as e: print "I/O error({0}): {1}".format(e.errno, e.strerror) print full_image_path + filename imageerrors['image_not_found'] = full_image_path + filename except ValueError: print "Could not convert data to an integer." except: import sys print "Unexpected error:", sys.exc_info()[0] raise else: if save: theimage.save() image_success = True if save and image_success: StudentPageCarouselItem.objects.get_or_create(page=sp, image=theimage) newimageerrordict = dict((k, v) for k, v in imageerrors.iteritems() if v) if newimageerrordict: images_errors.append({image: newimageerrordict}) errordict = dict((k, v) for k, v in sp_errs.iteritems() if v) if errordict: depterrors[sp.title] = errordict errordict = dict((k, v) for k, v in depterrors.iteritems() if v) if errordict: errors[theprogramme] = errordict print "%(student_count)s students" % { 'student_count': student_count } total_students += student_count print "%(d)s departments imported, total %(s)s students, %(sv)s saved (%(n)s new)" % { 'd': dept_count, 's': total_students, 'sv': student_save_count, 'n': new_count, } profile_not_found_count = 0 image_not_found_count = 0 for dept, depterrors in errors.iteritems(): print '\n' + dept + '\n' + '='*len(dept) for name, sp_errs in depterrors.iteritems(): if isinstance(sp_errs, dict): print name print sp_errs['image_not_found'] profile_not_found_count += 1 print '\nImage errors\n============' for image_dict in images_errors: for image, error_dict in image_dict.iteritems(): if isinstance(error_dict, dict): print error_dict['image_not_found'] image_not_found_count += 1 print str(profile_not_found_count) + " profile images not found" print str(image_not_found_count) + " artwork images not found" print '\n\n' return images_errors, errors
def doimport(**kwargs): path = kwargs.get('path', PATH) save = kwargs.get('save', False) image_path = kwargs.get('image_path', IMAGE_PATH) ruthless = kwargs.get('ruthless', False) newsindex = NEWS_INDEX tree = ET.parse(path) root = tree.getroot() errors = [] images_errors = [] for item in root.findall('news_item'): itemerrors = {} # sort out what instance this is news_contentid = item.attrib['contentid'] title, itemerrors['title'] = text_from_elem(item, 'title', length=255) date = parse_date(item.find('goinglivedate').text.strip().replace('.','-')) or datetime.date.today() try: newsitem = NewsItem.objects.get(rca_content_id=news_contentid) except NewsItem.DoesNotExist: newsitem = NewsItem(rca_content_id=news_contentid) newsitem.title = title newsitem.date = date newsitem.intro = richtext_from_elem(item.find('intro')) newsitem.slug = make_slug(newsitem) # possibly delete any images that are embedded in the existing body if ruthless: soup = BeautifulSoup(newsitem.body, 'html.parser') to_delete_ids = [] for x in soup.find_all('embed'): try: to_delete_ids.append(int(x.attrs['id'])) except ValueError: pass if to_delete_ids: RcaImage.objects.filter(id__in=to_delete_ids).delete() # build the body strings = [] if item.find('texts'): for elem in item.find('texts').findall('text'): html = richtext_from_elem(elem.find('content')) strings.append(html) newsitem.body = '\n'.join(strings) # save newsitem if save: if newsitem.id: newsitem.save() else: newsindex.add_child(newsitem) tobesaved = False if item.find('images') is not None: # first delete images that haven't got a contentid if ruthless: for c in NewsItemCarouselItem.objects.filter(page=newsitem): c.image.delete() c.delete() for image in item.find('images').findall('image'): imageerrors = {} metadata = image.find('imagemetadata') im_contentid = image.attrib['contentid'] filename = urllib2.unquote(image.find('filename').text.strip()) try: theimage = RcaImage.objects.get(rca_content_id=im_contentid) except 
RcaImage.DoesNotExist: theimage = RcaImage(rca_content_id=im_contentid) theimage.title, imageerrors['title'] = text_from_elem(metadata, 'title', length=255, textify=True) theimage.creator, imageerrors['creator'] = text_from_elem(metadata, 'creator', length=255, textify=True) theimage.medium, imageerrors['medium'] = text_from_elem(metadata, 'media', length=255, textify=True) theimage.photographer, imageerrors['photog'] = text_from_elem(metadata, 'photographer', length=255, textify=True) theimage.permission, imageerrors['perms'] = text_from_elem(metadata, 'rights', length=255, textify=True) caption, imageerrors['caption'] = text_from_elem(metadata, 'caption', length=255, textify=True) theimage.alt = caption #theimage.width, imageerrors['width'] = text_from_elem(metadata, 'width', length=255) #theimage.height, imageerrors['height'] = text_from_elem(metadata, 'height', length=255) try: with File(open(image_path + filename.encode('utf-8'), 'r')) as f: if theimage.id: if save: theimage.delete() theimage.file = f if save: theimage.save() except IOError as e: print "I/O error({0}): {1}".format(e.errno, e.strerror) print repr(filename) except ValueError: print "Could not convert data to an integer." 
except: import sys print "Unexpected error:", sys.exc_info()[0] raise if save and theimage.is_landscape(): try: carousel = NewsItemCarouselItem.objects.get( page = newsitem, image = theimage, ) except NewsItemCarouselItem.DoesNotExist: carousel = NewsItemCarouselItem( page = newsitem, image = theimage, ) if save: carousel.save() elif save and theimage.id: imagestring = '<embed alt="%(alt)s" embedtype="image" format="right" id="%(id)s"/>' % { 'alt': theimage.alt, 'id': theimage.id, } newsitem.body = imagestring + newsitem.body tobesaved = True imageerrordict = dict((k, v) for k, v in imageerrors.iteritems() if v) if imageerrordict: images_errors.append({image: imageerrordict}) if tobesaved and save: newsitem.save() errordict = dict((k, v) for k, v in itemerrors.iteritems() if v) if errordict: errors.append({item: errordict}) return errors, images_errors
def doimport(**kwargs): save = kwargs.get('save', False) path = kwargs.get('path', PATH) image_path = kwargs.get('image_path', IMAGE_PATH) show_index = SHOW_INDEX tree = ET.parse(path) root = tree.getroot() errors = {} images_errors = [] dept_count = 0 total_students = 0 new_count = 0 student_save_count = 0 for d in root.findall('department'): dept_count += 1 page = d.find('page') depterrors = {} dept_title, depterrors['title'] = text_from_elem(page, 'title') specialism = '' print '\nNow importing: ' + repr(dept_title) if dept_title in PROGRAMME_SPECIALISMS.keys(): dept_title, specialism = PROGRAMME_SPECIALISMS[dept_title] print 'dept: ' + repr(dept_title) theprogramme = PROGRAMMES[dept_title] print 'prog: ' + repr(theprogramme) theschool = SCHOOLS[dept_title] print 'scho: ' + repr(theschool) h = html2text.HTML2Text() h.body_width = 0 try: blurb = page.find('texts').findall('text')[0].find('content') except AttributeError: blurb = page.find('synopsis') blurb = h.handle(blurb.text).strip() print "Blurb: " + repr(blurb) print "******* note that the above text will not be imported *******" student_count = 0 for s in d.findall('student'): student_count += 1 s = s.find('studentpage') sp_contentid = s.attrib['contentid'] try: sp = StudentPage.objects.get(rca_content_id=sp_contentid) except StudentPage.DoesNotExist: sp = StudentPage(rca_content_id=sp_contentid) sp_errs = {} sp.title, sp_errs['title'] = text_from_elem(s, 'title', length=255) # there is no intro text in any of the data at time of writing # intro, sp_errs['intro'] = text_from_elem(s, 'intro') sp.slug = make_slug(sp) statement = richtext_from_elem(s.find('statement')) statement_text, sponsors, collaborators = statement_extract( statement) sp.statement = statement_text sp.work_description = statement_text # handle the metadata fields metadata = s.find('metadata') # format the current degree sp.degree_year, sp_errs['deg_year'] = text_from_elem(metadata, 'year', length=255) degree_subject, sp_errs['deg_subj'] = 
text_from_elem(metadata, 'degrees', length=255) if degree_subject[-1] == '?': degree_subject = degree_subject[:-1] sp.degree_subject = DEGREE_SUBJECTS[degree_subject] degree_qualification, sp_errs['deg_qual'] = text_from_elem( metadata, 'degree', length=255) sp.degree_qualification = degree_qualification.lower() # metadata contains first and last names in separate fields sp.first_name, sp_errs['first_name'] = text_from_elem(metadata, 'firstname', length=255) sp.last_name, sp_errs['last_name'] = text_from_elem(metadata, 'surname', length=255) # we worked out the programme and school earlier from the dept_page sp.programme = theprogramme sp.school = theschool if not specialism and metadata.find('specialism') is not None: sp.specialism, sp_errs['specialism'] = text_from_elem( metadata, 'specialism') else: sp.specialism = specialism # handle profile image try: profile_image = RcaImage.objects.get( rca_content_id=sp_contentid + 'profile_image') except RcaImage.DoesNotExist: profile_image = RcaImage(rca_content_id=sp_contentid + 'profile_image') profile_filename = slugify(unicode(sp.title)).replace('-', '_') profile_image_path = image_path + "show_2013_profiles/2400_" + sp.programme + "/" profile_image.title = sp.title + ' profile image' if not profile_image.id: try: with File( open( normalize( "NFKD", profile_image_path + profile_filename + '.jpg'), 'rb')) as f: profile_image.file = f if save: profile_image.save() except IOError as e: try: with File( open( normalize( "NFKD", profile_image_path + profile_filename + '.png'), 'rb')) as f: profile_image.file = f if save: profile_image.save() except IOError as e: print "I/O error({0}): {1}".format( e.errno, e.strerror ) + " " + profile_image_path + profile_filename sp_errs[ 'image_not_found'] = profile_image_path + profile_filename except ValueError: print "Could not convert data to an integer." 
except: import sys print "Unexpected error:", sys.exc_info()[0] raise else: if save: profile_image.save() sp.profile_image = profile_image # save the studentpage for foreignkey purposes if save: student_save_count += 1 if sp.id: sp.save() else: new_count += 1 show_index.add_child(sp) elif not sp.id: new_count += 1 # handle the sponsors and collaborators from earlier for spon in sponsors: name, sp_errs['sponsors'] = check_length(spon, 255) if save: sponpage = StudentPageWorkSponsor(page=sp, name=name) sponpage.save() for col in collaborators: name, sp_errs['collaborators'] = check_length(col, 255) if save: colpage = StudentPageWorkCollaborator(page=sp, name=name) colpage.save() # handle the cv fields cv = s.find('cv') sp_errs['degree'] = cv_handle(cv, 'degrees', StudentPageDegree, sp, length=255, fieldname='degree', save=save) sp_errs['exhibition'] = cv_handle(cv, 'exhibition', StudentPageExhibition, sp, length=255, save=save) sp_errs['experience'] = cv_handle(cv, 'experience', StudentPageExperience, sp, length=255, save=save) sp_errs['awards'] = cv_handle(cv, 'awards', StudentPageAwards, sp, length=255, fieldname='award', save=save) if cv.find('sponsors') is not None: sp_errs['sponsors'] = cv_handle(cv, 'sponsors', StudentPageWorkSponsor, sp, length=255, fieldname='name', save=save) # currently the model doesn't have publications or conferences #sp_errs['publications'] = cv_handle( # cv, 'publications', StudentPagePublications, sp, length=255) #sp_errs['conferences'] = cv_handle( # cv, 'conferences', StudentPageConferences, sp, length=255) if s.find('emails') is not None: for emailaddress in s.find('emails').getchildren(): emailtext = emailaddress.text.strip() if save: StudentPageContactsEmail.objects.get_or_create( page=sp, email=emailtext) if s.find('phonenumbers') is not None: for num in s.find('phonenumbers').getchildren(): if num.text: phonenumber = num.text.strip() if save: StudentPageContactsPhone.objects.get_or_create( page=sp, phone=phonenumber) if 
s.find('urls') is not None: for url in s.find('urls').getchildren(): if url.text: urltext = url.text.strip() if save: StudentPageContactsWebsite.objects.get_or_create( page=sp, website=urltext) # handle images tag images = s.find('images') forloop_counter = 0 if images is not None: for image in images.findall('image'): forloop_counter += 1 imageerrors = {} metadata = image.find('imagemetadata') im_contentid = image.attrib['contentid'] if not im_contentid: im_contentid = sp_contentid + '_image_' + str( forloop_counter) try: theimage = RcaImage.objects.get( rca_content_id=im_contentid) except RcaImage.DoesNotExist: theimage = RcaImage(rca_content_id=im_contentid) theimage.title, imageerrors['title'] = text_from_elem( metadata, 'title', length=255, textify=True) theimage.creator, imageerrors['creator'] = text_from_elem( metadata, 'creator', length=255, textify=True) theimage.medium, imageerrors['medium'] = text_from_elem( metadata, 'media', length=255, textify=True) photographer, imageerrors['photographer'] = text_from_elem( metadata, 'photographer', length=255) if photographer.strip().startswith('©'): photographer = photographer.replace('©', '').strip() theimage.photographer = photographer theimage.permissions, imageerrors[ 'permissions'] = text_from_elem(metadata, 'rights', length=255) caption, imageerrors['caption'] = text_from_elem( metadata, 'caption', length=255, textify=True) theimage.alt = caption #theimage.width, imageerrors['width'] = text_from_elem(metadata, 'width', length=255) #theimage.height, imageerrors['height'] = text_from_elem(metadata, 'height', length=255) filename = unicode( urllib2.unquote(image.find('filename').text.strip())) image_success = False full_image_path = image_path + '2400_' + sp.programme + "/" if not theimage.id: try: with File( open( normalize("NFKD", full_image_path + filename), 'rb')) as f: theimage.file = f if save: theimage.save() image_success = True except IOError as e: try: with File( open( normalize( "NFKD", 
full_image_path + filename[:-4] + '.png'), 'rb')) as f: theimage.file = f if save: theimage.save() image_success = True except IOError as e: print "I/O error({0}): {1}".format( e.errno, e.strerror) print full_image_path + filename imageerrors[ 'image_not_found'] = full_image_path + filename except ValueError: print "Could not convert data to an integer." except: import sys print "Unexpected error:", sys.exc_info()[0] raise else: if save: theimage.save() image_success = True if save and image_success: StudentPageCarouselItem.objects.get_or_create( page=sp, image=theimage) newimageerrordict = dict( (k, v) for k, v in imageerrors.iteritems() if v) if newimageerrordict: images_errors.append({image: newimageerrordict}) errordict = dict((k, v) for k, v in sp_errs.iteritems() if v) if errordict: depterrors[sp.title] = errordict errordict = dict((k, v) for k, v in depterrors.iteritems() if v) if errordict: errors[theprogramme] = errordict print "%(student_count)s students" % {'student_count': student_count} total_students += student_count print "%(d)s departments imported, total %(s)s students, %(sv)s saved (%(n)s new)" % { 'd': dept_count, 's': total_students, 'sv': student_save_count, 'n': new_count, } profile_not_found_count = 0 image_not_found_count = 0 for dept, depterrors in errors.iteritems(): print '\n' + dept + '\n' + '=' * len(dept) for name, sp_errs in depterrors.iteritems(): if isinstance(sp_errs, dict): print name print sp_errs['image_not_found'] profile_not_found_count += 1 print '\nImage errors\n============' for image_dict in images_errors: for image, error_dict in image_dict.iteritems(): if isinstance(error_dict, dict): print error_dict['image_not_found'] image_not_found_count += 1 print str(profile_not_found_count) + " profile images not found" print str(image_not_found_count) + " artwork images not found" print '\n\n' return images_errors, errors