def __init__(self):
    """Create an empty article with no text, no floats and no citations."""
    # Plain-text metadata.
    self.title = ''
    self.abstract = ''
    self.authors = None

    # Collections that are filled while the article is parsed.
    self.keywords = []
    self.glossary = {}
    self.subtitles = []   # Deprecated, use sections!
    self.sections = []
    self.paragraphs = []
    self.references = []
    self.structure = []   # Defines the overall structure of the article.

    # Helper objects for floats and literature handling.
    self.figures = Figures()
    self.tables = Tables()
    self.bibliography = Bibliography()
    self.citations = Citations()

    # Parsing/rendering state.
    self.section = None
    self.content = None
    self.connect = False
class Article(object): def __init__(self): self.title = '' self.authors = None self.abstract = '' self.keywords = [] self.glossary = {} self.subtitles = [] # Depriciated, use sections! self.section = None self.sections = [] self.paragraphs = [] self.figures = Figures() self.tables = Tables() self.references = [] self.structure = [] # Defines the overall structure of the article. self.bibliography = Bibliography() self.citations = Citations() self.content = None self.connect = False def referencing(self, numbered=False, brackets=False, rest=True, connect=False): """This function goes through all the paragraph and replaces the references for printing purposes.""" if numbered: # Numbered referencing style. self.citations.numbered = True if brackets: self.citations.brackets = True if rest: self.citations.rest = True if connect: self.connect=True mapping = {} text = '' def replace(finding): match = finding.group(0) print "Found match at:", match, match[1:-1], mapping #paragraph.references if "Figure" in match or "Table" in match: items = match.split('; ') for item in items: print item if match[1:-1] in mapping: #paragraph.references #print "match in finding" #print "match is:", match[1:-1] if "Figure" in match or "Table" in match: #print "mapping is", match, match[1:-1] #print "Identified %s as match" % match, mapping, mapping[match[1:-1]] citationstyle = "(%s)" else: if not numbered: citationstyle = "(%s)" else: citationstyle = "[%s]" return citationstyle % mapping[match[1:-1]] #paragraph.references else: return match for paragraph in self.paragraphs: #print paragraph, type(paragraph) # find [] # use regex to replace all in place citations. 
pattern = "\[(.+?)\]" findings = re.findall(pattern, unicode(paragraph)) for finding in findings: if finding.startswith('Figure'): #finding in self.figures or items = finding.replace('; ', ';').split(';') for item in items: figurename = item.replace('Figure: ', '') self.figures.order.append(figurename) numberedfigure = "Figure %s" % (self.figures.order.index(figurename)+1) paragraph.references[item] = numberedfigure #self.figures[figurename].title = numberedfigure elif finding in self.tables or finding.startswith('Table'): pass # items = finding.replace('; ', ';').split(';') # for item in items: # tablename = item.replace('Table: ', '') # m = re.findall(' (\w)$', tablename) # http://stackoverflow.com/questions/2362471/match-start-and-end-of-file-in-python-with-regex # if m: # panel = m[0] # print "Panel =", panel # tablename = tablename[:-2] # print "Tabelname =", tablename # else: # panel = '' # self.tables.order.append(tablename) # try: # numberedtable = "Table %s" % (self.tables.order.index(tablename)+1) # print "Numbered table =", numberedtable # paragraph.references[item] = numberedtable+panel # print "Assigning %s to %s" % (item, numberedtable+panel) # #self.tables[tablename] = numberedtable # except Exception as e: # print "Failed to refer to table: ", e elif finding.startswith('http://'): continue # Is link else: paragraph.references[finding] = Refs(numbered, self.citations) #print finding regex = '(\d,\d|\d-\d)' #http://stackoverflow.com/questions/8609597/python-regular-expressions-or # This regex is insufficient as it false tries to map references like those: [BMJ, 326, 1297-1299 (2003); BMJ, 337, a399 (2008)] m = re.findall(regex, finding) regex = '(\w,\s\d)' am = re.findall(regex, finding) #passed = False #while not passed: if not am and (m or (isdigit(finding) and int(finding) < 500)): print "Found m" items = finding.split(',') #print items for item in items[:]: if "-" in item: start, end = item.split('-') #try: items.extend(xrange(int(start), 
int(end)+1)) #except: passed = True #print items for item in map(str, items): #print item if item in self.references: #print "item in references", item, type(item) r = self.references[item] paragraph.references[finding].append(r) mapping = paragraph.references self.citations.add(r) text = re.sub(pattern, replace, unicode(paragraph)) #http://stackoverflow.com/questions/3997525/python-replace-with-regex else: pass #paragraph.references[finding].append(item) #mapping = paragraph.references #text = re.sub(pattern, replace, str(paragraph)) paragraph.text = text #print text #passed = True else: #print "did not found m" #print paragraph.references[finding] items = finding.replace('; ', ';').split(';') for item in items: #print "item", item if item in self.figures or item.startswith('Figure'): #print "found figure again" pass#continue #print "FOUND and IMAGE" elif item in self.tables or item.startswith('Table'): pass#continue elif item.startswith('http://'): continue # Is link try: #print "Trying to map i pmid" id = int(item) #print "id =", id, type(id) r = self.bibliography.find(id, printing=False) #print "Found reference:", r[0].ref() self.citations.add(r[0]) #print "added citation to citations:", len(self.citations), self.citations paragraph.references[finding].append(r[0]) #print "Appended reference to paragraph references." 
mapping = paragraph.references #print "Mapped paragaph" #pattern = "\[.+?\]" text = re.sub(pattern, replace, unicode(paragraph)) #print "paragraph is now subsitututed:", text except Exception as e: print "Exception occured:", e print item, "is not a pmid" item = item.replace('.,', '.').replace(' and ', ' & ') print self.references if item in self.references: r = self.references[item] print "found reference", r paragraph.references[finding].append(r or item) #print paragraph.references[finding] ## mapping = paragraph.references #paragraph.text = self.citations.add(r) text = re.sub(pattern, replace, unicode(paragraph)) #print len(self.citations), self.citations elif item.replace('Figure: ' , '') in self.figures: #print "Figure need to be handled", item, paragraph.references[finding] mapping = paragraph.references text = re.sub(pattern, replace, unicode(paragraph)) #print item, "is not in references." elif item.replace('Table: ', '') in self.tables or item.replace('Table: ', '')[:-1] in self.tables: mapping = paragraph.references text = re.sub(pattern, replace, unicode(paragraph)) else: print "Ref did not match anything:", finding if "Table: " in item: # I don't know why this has to be here? mapping = paragraph.references text = re.sub(pattern, replace, unicode(paragraph)) else: try: if len(item) > 5: # Too short will probaly match something weird. 
# Search direct against Entrez and retrieve pubmed id: ids = self.bibliography.search(term=item) if len(ids) == 1: r = self.bibliography.find(int(ids[0]), printing=False) #print r self.citations.add(r[0]) #print r[0] paragraph.references[finding].append(r[0]) mapping = paragraph.references #print paragraph.references text = re.sub(pattern, replace, unicode(paragraph)) #print text else: raise Exception else: raise Exception except: try: # Assumed to be alredy a reference: paragraph.references[finding].append(item) text = re.sub(pattern, replace, unicode(paragraph)) #print finding, paragraph.references[finding] except Exception as e: print e, finding, "is no reference." paragraph.text = text or paragraph.text for term in LATIN: paragraph.text = paragraph.text.replace(" %s " % term, " *%s* " % term).replace("(%s)" % term, " (*%s*)" % term) # Check whether reference is in references # Create a reference instance # declare type of representation # insert representation as requested. # pmid? ## def replace(finding): ## match = finding.group(0) ## print "match:", match ## if match[1:-1] in mapping: ## print "match in mapping" ## return mapping[match[1:-1]] ## else: ## return match #r = Reference(Entrez.read(Entrez.esummary(db='pubmed', id=id))) #print "fetching reference" #print r #print type(r) ##print r.ref()#, findings #print "mapping reference" #mapping[finding] = r.ref() #print id #print type(r[0].ref()) #print "ref()", r[0].ref(), r[0].ref(), r[0].ref() #print mapping[finding] ## #except: ## # It is not a pmid: ## if "et al." 
in item: ## name, year = item.split('et al.') ## name, year = name.strip(), year.strip() ## for reference in self.references: ## if name in reference and year in reference: ## mapping[finding] = reference ## print "mapping:", mapping ## #print reference ## print "substituitng" ## new_paragraph = re.sub("\[(.+?)\]", replace, str(paragraph))#, len(re.sub("\[(.+?)\]", replace, str(paragraph))) ## print new_paragraph def glossaring(self): """Creates an automatic generated glossary.""" for paragraph in self.paragraphs: findings = re.findall("[A-Z]{2,}", unicode(paragraph)) #More than two capital letters. #print findings abbreviation_explained = re.findall("[a-z]?[A-Z]{2,}[a-z]? \(.*?\)", unicode(paragraph)) #"([a-z]{1})?[A-Z]{2,}([a-z]{1})? \(.*?\)" for abbreviation_explaination in abbreviation_explained: #print abbreviation_explaination abbreviation = abbreviation_explaination.split(' (')[0] explaination = abbreviation_explaination.split('(')[1].split(')')[0] self.glossary[abbreviation] = explaination explaination_abbreviated = re.findall("\(([a-z]?[A-Z]{2,}[a-z]?)\)", unicode(paragraph)) #print explaination_abbreviated for explaination_abbreviation in explaination_abbreviated: number_of_words = len(explaination_abbreviation) if explaination_abbreviation[-1] == 's': # Plural number_of_words -= 1 # Form a regular expression to fetch the explaination in front the abbreviation: word = '[A-Z,a-z,0-9]+ ' regex = '%s\([a-z]?[A-Z]{2,}[a-z]?\)' % (word * number_of_words) #print regex results = re.findall(regex, unicode(paragraph)) for result in results: explaination = result.split('(')[0] abbreviation = result.split(' (')[1].split(')')[0] self.glossary[abbreviation] = explaination #print result def counting(self): """Counts the character lenght of each section.""" self.count = 0 for section in self.sections: section.count = 0 for paragraph in section.paragraphs: section.count += paragraph.count for subsection in section.subsections: section.count += 
len(subsection.title.replace('**', '*').replace('*', '')) for paragraph in subsection.paragraphs: subsection.count += paragraph.count section.count += paragraph.count print section.title, section.count def structuring(self, type='Console', emphasis='**', count=False, rest=True, connect=False): print("Structuring") figuresAndTables = False # Will be set to True if there is a Tables a Figures section and prevent the generation of automtaic table and figures section. if count: self.counting() structure = self.structure if self.title: structure.append(emphasis+unicode(self.title)+emphasis) if self.abstract: structure.append('\n'+emphasis+'Abstract:'+emphasis) if count: structure[-1] += ' [%s]' % self.abstract.count structure.append('%s' % self.abstract) if self.keywords: structure.append('\n'+emphasis+'Keywords'+emphasis+': %s' % self.keywords) if self.content: structure.append('\n'+emphasis+'Content:'+emphasis) structure.append(str(self.content)) #print("Glossary: %s" % self.glossary) if self.glossary: #print("Has glossary") structure.append('\n'+emphasis+'Glossary:'+emphasis) if count: structure[-1] += ' [%s]' % len(self.glossary) abbreviations = [structure.append('%s = %s' % (k, v)) for k,v in self.glossary.items()] #.title() structure.append('') if self.sections: for section in self.sections: structure.append(emphasis+unicode(section)+emphasis) if count: structure[-1] += ' [%s]' % section.count if section.paragraphs: for paragraph in section.paragraphs: if not rest: structure.append('%s\n' % unicode(paragraph).replace('**\n', '.** ')) if section.subsections: for subsection in section.subsections: structure.append(emphasis+str(subsection)+emphasis) if count: structure[-1] += ' [%s]' % subsection.count for paragraph in subsection.paragraphs: if not rest: structure.append('%s\n' % unicode(paragraph).replace('**\n', '.** ')) if section.title == "Figures & Tables": if count: structure[-1] = structure[-1][:-3]+' [%s] [%s]' % (len(self.figures), len(self.tables)) 
#structure[-1] += ' [%s] [%s]' % (len(self.figures), len(self.tables)) structure.append(unicode(self.figures) or '') structure.append(unicode(self.tables) or '') figuresAndTables = True ## else: ## for paragraph in section.paragraphs: ## structure.append('%s\n' % paragraph) else: structure.append('\n\n'.join(map(unicode, self.paragraphs))) # Call referencing here. if rest: if self.connect: structure.append('\n============\nBibliography\n============\n') else: structure.append('\nReferences\n==========') # ReST else: structure.append('\n'+emphasis+'References'+emphasis) if count: structure[-1] += ' [%s]' % len(self.citations) #structure.append(str("\n".join(self.references))) structure.append(str(self.citations)) if not figuresAndTables: structure.append('') if self.figures: structure.append('**Figures**') structure.append(unicode(self.figures) or '') structure.append('') if self.tables: structure.append('**Tables**') structure.append('') structure.append(unicode(self.tables) or '') if rest: #print("rest") #print structure for index, part in enumerate(structure[:]): if "Glossary" in part: #print("Glossary in structure") abbreviations = "\n".join(["%s\n %s" % (k, v) for k, v in self.glossary.items()]) #print(abbreviations) structure[index].replace('========\nGlossary\n========', '========\nGlossary\n========\n%s' % abbreviations) return structure def _string(self): """Transforms structure into a single string.""" if not self.structure: self.structuring() return '\n'.join(self.structure) string = property(_string) def __repr__(self): """Prints brief consentive summary of the article Number of sections, parapgraphs and total word count""" return self.string def __unicode__(self): return self.__repr__() def __len__(self): """Character count.""" return len(self.string) def meta(self): """Return meta-information such as table of conent""" def __str__(self): return self.string def findCitation(self): """Proof of concept whether a pmid is sufficient.""" def output(self): 
article.glossaring() article.referencing() print self def printIt(self): for section in self.sections: print section print for subsection in section.subsections: print subsection print #print len(subsection.paragraphs) for paragraph in subsection.paragraphs: print paragraph print def word(self, input=None, output=None, close=False, insert=False): """Creates a Word output document.""" if input: word = Word(input) else: word = Word() if insert: word.insert(self.string) else: word.write(self.string) if output: word.save(output) else: word.save() if close: word.close() def wd(self, input=None, output=None, close=False, insert=False): """Creates a Word output document including emphasis. Note this is unfortunaly very slow.""" if input: word = Word(input) else: word = Word() #word.visible() strings = self.string.split('\n') strings.reverse() for string in strings: print string if "*" in string: word.insertEmphasis(string+'\n') else: word.insert(string+'\n') if output: word.save(output) else: word.save() if close: word.close() def wordIt(self, input=None, output=None, close=False): """Creates a Word output document including emphasis. 
Note this is unfortunaly very slow.""" if input: word = Word(input) else: word = Word() for figure in self.figures.values(): figure.filename = os.path.join(os.getcwd(), figure.filename).replace('\\', '/') #self.name[:-3] word.i(self.string) for figure in self.figures.values(): try: word.insertImage(figure.filename) except Exception as e: print e pass for table in self.tables.values(): if not table.raw: word.insertTable(table) #figure.filename = os.path.join(os.getcwd(), self.name[:-3], figure.filename) #word.insertImage(os.path.join(os.getcwd(), self.name[:-3], figure.filename)) #__file__ ## if output: ## try: word.save(output) ## except: pass ## else: ## try: word.save() ## except: pass ## if close: ## word.close() def docX(self): """Produces a docx representation of the article.""" # Default set of relationships - these are the minimum components of a document relationships = relationshiplist() # Make a new document tree - this is the main part of Word document. document = newdocument() # This xpath Location is where most interesting content lives docbody = document.xpath('/w:document/w:body', namespaces=nsprefixes)[0] docbody.append(heading(str(self.title), 1)) docbody.append(heading('Abstract', 2)) docbody.append(paragraph(str(self.abstract))) for section in self.sections: docbody.append(heading(str(section.title), 1)) if section.paragraphs: for para in section.paragraphs: docbody.append(paragraph(str(para), jc='both')) if section.subsections: for subsection in section.subsections: docbody.append(heading(str(subsection), 2)) for para in subsection.paragraphs: docbody.append(paragraph(str(para), jc='both')) ## docbody.append(heading("Tables", 1)) ## for title, t in self.tables.items(): ## docbody.append(paragraph(title, style='ListNumber')) # ListBullet ## docbody.append(table(str(t).split('\n'))) ## # Add an image: relationships, picpara = picture(relationships, 'image1.png', 'Figure 1') docbody.append(picpara) # Search and replace: print 'Searching for 
soemthing in a paragraph ...', if search(docbody, 'the awesomeness'): print 'found it!' else: print 'nope.' print 'Searching for something in a heading ...', if search(docbody, '200 lines'): print 'found it!' else: print 'nope.' # Add a pagebreak: docbody.append(pagebreak(type='page', orient='portrait')) docbody.append(heading('References', 1)) docbody.append(paragraph(str(self.citations))) # Create the properteies, contenttypes, and other support files: coreprops = coreproperties(title='Human Frontier Science Promotion', subject='Full Application', creator='Daniel Wuttke', keywords=self.keywords) appprops = appproperties() contypes = contenttypes() webings = websettings() wordships = wordrelationships(relationships) # Save the document: savedocx(document, coreprops, appprops, contypes, webings, wordships, 'HFSP.docx') def show(self): """Opens the output of the article.""" word = Word() word.show() def wordle(self): """Creates a wordle of a doucment""" from library.wc import wordCloud string = [] for section in self.sections: string.append(str(section)) for subsection in section.subsections: string.append(str(subsection)) for paragraph in self.paragraphs: string.append(str(paragraph)) string = " ".join(string) print "Wordle string:", string # Remove references: string = re.sub(r'\[.*?\]', '', string) #http://stackoverflow.com/questions/640001/how-can-i-remove-text-within-parentheses-with-a-regex wordCloud(string.replace('**', '').replace('*', '').replace(' et al.', ' ').replace('JPM', '').replace(',', '').replace(';', '').replace(':', '')) #self.string def contenting(self): """Builds a table of content for this article.""" self.content = Content(self.sections) return self.content