def dividePdf(self):
    """
    Split the volume PDF into one PDF file per article.

    Walks every article listed in ``self.article_data`` (the data taken
    from the LIMB toc), derives the output file name from the article
    title hash and its page range, and asks ``tools.cutPdf`` to write
    that page range of ``self.pdf_path`` into ``self.pdf_output_dir``.

    Raises:
        IOError: as soon as a single cut reports failure; no further
            articles are processed after that.
    """
    failure_template = ("PDF division failed. Input file: {0}, "
                        "start page: {1}, end page: {2}")

    for _, section_articles in self.article_data.items():
        for entry in section_articles:
            title_hash = tools.getHashName(entry["TitleDocMain"])
            first_page = entry["start_page"]
            last_page = entry["end_page"]
            # legr: TODO : should probably subtract 1 from end_page
            file_name = tools.getArticleName(title_hash, first_page, last_page)
            target_path = os.path.join(self.pdf_output_dir, file_name)
            self.debug_message("creating file {0}".format(target_path))

            cut_succeeded = tools.cutPdf(
                self.pdf_path, target_path, first_page, last_page)
            if cut_succeeded:
                continue

            # The cut (a call to pdftk, per the original note) failed:
            # bail out immediately instead of producing a partial set.
            raise IOError(
                failure_template.format(self.pdf_path, first_page, last_page))

    return None
def createArticleXML(self, doc, article, date_published):
    """
    Build the OJS ``<article>`` element for a single article dict.

    Args:
        doc: the XML document object used to create elements.
        article: dict describing the article. Reads 'DocLanguage',
            'TitleDocMain', 'start_page' and 'end_page'; 'Author' and
            'Subject' are optional.
        date_published: value written into the ``date_published`` tag.

    Returns:
        The populated ``article`` element (not attached to ``doc``).
    """
    # --- article stub: locale, language and title ---
    language = article['DocLanguage']
    article_node = doc.createElement('article')
    article_node.setAttribute('locale', tools.convertLangToLocale(language))
    article_node.setAttribute('language', language)
    article_node.appendChild(
        self.createXMLTextTag(doc, 'title', article['TitleDocMain']))

    # NOTE: an <id type="dbcMarcxID"> tag built from article['dbcMarcxID']
    # used to be added here. It caused OJS to panic and nobody knew why it
    # was needed, so it is intentionally not emitted.

    # --- page range ---
    first_page = article['start_page']
    last_page = article['end_page']
    # legr: Calculate offset so real start- and end page can be forwarded
    # to OJS
    page_offset = self.get_calculated_offset()
    if offset_is_zero := False:
        pass
    if page_offset == 0:
        offset_message = "Warning: offset = 0. Maybe all pages are uncounted?"
    else:
        offset_message = "Information: Calculated offset value: {0}".format(
            page_offset)
    print(offset_message)
    printed_range = "{0}-{1}".format(first_page + page_offset,
                                     last_page + page_offset)
    article_node.appendChild(
        self.createXMLTextTag(doc, 'pages', printed_range))

    # --- publication date ---
    article_node.appendChild(
        self.createXMLTextTag(doc, 'date_published', date_published))

    # --- authors ---
    # No author tag at all when the key is absent (e.g. Front Matter).
    # 'Author' is a list of zero, one or multiple authors.
    if 'Author' in article:
        for person in article['Author']:
            article_node.appendChild(self.createAuthorXML(doc, person))

    # --- subjects ---
    # see http://pkp.sfu.ca/wiki/index.php/Importing_and_Exporting_Data#Creating_the_XML_Import_File
    if 'Subject' in article:
        indexing_node = doc.createElement('indexing')
        raw_subjects = article['Subject']
        # A list of subjects is collapsed into one ';'-separated string;
        # any other value is passed through unchanged.
        if isinstance(raw_subjects, list):
            subject_text = ';'.join(raw_subjects)
        else:
            subject_text = raw_subjects
        subject_node = self.createXMLTextTag(doc, 'subject', subject_text)
        subject_node.setAttribute('locale', tools.convertLangToLocale('da'))
        indexing_node.appendChild(subject_node)
        article_node.appendChild(indexing_node)

    # --- pdf link (galley) ---
    # The file name uses the un-offset page numbers, i.e. the same
    # arguments dividePdf-style naming is given: hash, start, end.
    title_hash = tools.getHashName(article['TitleDocMain'])
    pdf_file_name = tools.getArticleName(title_hash, first_page, last_page)
    article_node.appendChild(self.createGalleyXML(doc, pdf_file_name))

    return article_node