Ejemplo n.º 1
0
 def __init__(self):
     """Create an empty article.

     Text fields start as empty strings, collections start empty, and
     fresh Figures, Tables, Bibliography and Citations instances are
     created for this article.
     """
     self.title = ''
     self.authors = None
     self.abstract = ''
     self.keywords = []
     self.glossary = {}
     self.subtitles = [] # Deprecated, use sections!
     self.section = None
     self.sections = []
     self.paragraphs = []
     self.figures = Figures()
     self.tables = Tables()
     self.references = []
     self.structure = [] # Defines the overall structure of the article.
     self.bibliography = Bibliography()
     self.citations = Citations()
     self.content = None
     self.connect = False
Ejemplo n.º 2
0
class Article(object):
    def __init__(self):
        """Create an empty article.

        Text fields start as empty strings, collections start empty, and
        fresh Figures, Tables, Bibliography and Citations instances are
        created for this article.
        """
        self.title = ''
        self.authors = None
        self.abstract = ''
        self.keywords = []
        self.glossary = {} # abbreviation -> explanation, filled by glossaring()
        self.subtitles = [] # Deprecated, use sections!
        self.section = None
        self.sections = []
        self.paragraphs = []
        self.figures = Figures()
        self.tables = Tables()
        # NOTE(review): initialised as a list, but referencing() indexes it
        # with string keys (self.references[item]) -- confirm the real type.
        self.references = []
        self.structure = [] # Rendered text parts, appended to by structuring().
        self.bibliography = Bibliography()
        self.citations = Citations()
        self.content = None # Set by contenting().
        self.connect = False # Set by referencing(connect=True); read by structuring().

    def referencing(self, numbered=False, brackets=False, rest=True, connect=False):
        """Resolve the bracketed in-text references of every paragraph.

        Each ``[...]`` span found in a paragraph is treated as a figure
        reference, a table reference, a link or a literature citation.
        Citations are looked up in ``self.references`` or through
        ``self.bibliography``, added to ``self.citations`` and the paragraph
        text is rewritten with the nested ``replace`` callback.

        Args:
            numbered: If true, sets ``self.citations.numbered`` and makes
                ``replace`` render citations as ``[...]`` instead of ``(...)``.
            brackets: If true, sets ``self.citations.brackets``.
            rest: If true, sets ``self.citations.rest``.
            connect: If true, sets ``self.connect``.

        Side effects:
            Mutates ``paragraph.references`` and ``paragraph.text``, appends
            to ``self.figures.order`` and adds entries to ``self.citations``.
            Progress and failures are printed to stdout.
        """
        if numbered: # Numbered referencing style.
            self.citations.numbered = True
        if brackets:
            self.citations.brackets = True
        if rest:
           self.citations.rest = True
        if connect:
            self.connect=True

        # Both names are rebound further down; `replace` reads whatever
        # `mapping` is bound to at the time re.sub() invokes it.
        # NOTE(review): `text` is initialised only once, so a finding that
        # resolves nothing reuses the text substituted for an earlier
        # finding/paragraph when `paragraph.text` is assigned below.
        mapping = {}
        text = ''

        def replace(finding):
            # re.sub() callback: `finding` is a match object for one "[...]" span.
            match = finding.group(0)
            print "Found match at:", match, match[1:-1], mapping #paragraph.references
            if "Figure" in match or "Table" in match:
                # Debug output only; the split items are not used further.
                items = match.split('; ')
                for item in items:
                     print item
            if match[1:-1] in mapping: #paragraph.references
                #print "match in finding"
                #print "match is:", match[1:-1]
                if "Figure" in match or "Table" in match:
                    #print "mapping is", match, match[1:-1]

                    #print "Identified %s as match" % match, mapping, mapping[match[1:-1]]
                    citationstyle = "(%s)"
                else:
                    if not numbered:
                        citationstyle = "(%s)"
                    else:
                        citationstyle = "[%s]"
                return citationstyle % mapping[match[1:-1]] #paragraph.references
            else:
                # Unknown span: leave the original text untouched.
                return match

        for paragraph in self.paragraphs:
            #print paragraph, type(paragraph)
            # find []
            # use regex to replace all in place citations.
            pattern = "\[(.+?)\]"
            findings = re.findall(pattern, unicode(paragraph))
            for finding in findings:
                if finding.startswith('Figure'): #finding in self.figures or
                    # Number figures by position of their first occurrence in
                    # self.figures.order (the name is appended on every use).
                    items = finding.replace('; ', ';').split(';')
                    for item in items:
                        figurename = item.replace('Figure: ', '')
                        self.figures.order.append(figurename)
                        numberedfigure = "Figure %s" % (self.figures.order.index(figurename)+1)
                        paragraph.references[item] = numberedfigure
                        #self.figures[figurename].title = numberedfigure
                elif finding in self.tables or finding.startswith('Table'):
                    # Table numbering is disabled; the former implementation
                    # is kept below for reference.
                    pass
#                    items = finding.replace('; ', ';').split(';')
#                    for item in items:
#                        tablename = item.replace('Table: ', '')
#                        m = re.findall(' (\w)$', tablename) # http://stackoverflow.com/questions/2362471/match-start-and-end-of-file-in-python-with-regex
#                        if m:
#                            panel = m[0]
#                            print "Panel =", panel
#                            tablename = tablename[:-2]
#                            print "Tabelname =", tablename
#                        else:
#                            panel = ''
#                        self.tables.order.append(tablename)
#                        try:
#                            numberedtable = "Table %s" % (self.tables.order.index(tablename)+1)
#                            print "Numbered table =", numberedtable
#                            paragraph.references[item] = numberedtable+panel
#                            print "Assigning %s to %s" % (item, numberedtable+panel)
#                            #self.tables[tablename] = numberedtable
#                        except Exception as e:
#                            print "Failed to refer to table: ", e
                elif finding.startswith('http://'): continue # Is link
                else:
                    # Literature citation: container collecting the resolved references.
                    paragraph.references[finding] = Refs(numbered, self.citations)
                #print finding

                # `m` detects "digit,digit" / "digit-digit" (lists and ranges of
                # reference numbers); `am` detects "word, digit" and vetoes it.
                regex = '(\d,\d|\d-\d)' #http://stackoverflow.com/questions/8609597/python-regular-expressions-or 
                # This regex is insufficient as it falsely tries to map references like those: [BMJ, 326, 1297-1299 (2003); BMJ, 337, a399 (2008)]
                m = re.findall(regex, finding)
                regex = '(\w,\s\d)'
                am = re.findall(regex, finding)
                #passed = False
                #while not passed:
                # NOTE(review): `isdigit` is called as a free function; it is
                # not a builtin -- confirm it is defined at module level.
                if not am and (m or (isdigit(finding) and int(finding) < 500)):
                    print "Found m"
                    items = finding.split(',')
                    #print items
                    # Iterate over a copy because ranges are expanded into `items`.
                    for item in items[:]:
                        if "-" in item:
                            start, end = item.split('-')
                            #try: 
                            items.extend(xrange(int(start), int(end)+1))
                            #except: passed = True
                            #print items
                    for item in map(str, items):
                        #print item
                        if item in self.references:
                            #print "item in references", item, type(item)
                            r = self.references[item]
                            paragraph.references[finding].append(r)
                            mapping = paragraph.references
                            self.citations.add(r)
                            text = re.sub(pattern, replace, unicode(paragraph)) #http://stackoverflow.com/questions/3997525/python-replace-with-regex
                        else:
                            pass
                            #paragraph.references[finding].append(item)
                            #mapping = paragraph.references
                            #text = re.sub(pattern, replace, str(paragraph))
                    paragraph.text = text
                    #print text
                    #passed = True
                else:
                    #print "did not found m"
                    #print paragraph.references[finding]
                    items = finding.replace('; ', ';').split(';')
                    for item in items:
                        #print "item", item
                        # The figure and table branches only `pass`, so those
                        # items still reach the `try` below; only links are skipped.
                        if item in self.figures or item.startswith('Figure'):
                            #print "found figure again"
                            pass#continue #print "FOUND and IMAGE"
                        elif item in self.tables or item.startswith('Table'): pass#continue
                        elif item.startswith('http://'): continue # Is link
                        try:
                            # First attempt: treat the item as a numeric id
                            # (a PubMed id, going by the messages below).
                            #print "Trying to map i pmid"
                            id = int(item)
                            #print "id =", id, type(id)

                            r = self.bibliography.find(id, printing=False)
                            #print "Found reference:", r[0].ref()
                            self.citations.add(r[0])
                            #print "added citation to citations:", len(self.citations), self.citations
                            paragraph.references[finding].append(r[0])
                            #print "Appended reference to paragraph references."
                            mapping = paragraph.references
                            #print "Mapped paragaph"
                            #pattern = "\[.+?\]"
                            
                            text = re.sub(pattern, replace, unicode(paragraph))
                            #print "paragraph is now subsitututed:", text

                        except Exception as e:
                            # Anything that failed above (not only int()) ends
                            # up here; fall back to textual lookups.
                            print "Exception occured:", e
                            print item, "is not a pmid"
                            item = item.replace('.,', '.').replace(' and ', ' & ')
                            print self.references
                            if item in self.references:
                                r = self.references[item]
                                print "found reference", r
                                paragraph.references[finding].append(r or item)
                                #print paragraph.references[finding] ##
                                mapping = paragraph.references
                                #paragraph.text =
                                self.citations.add(r)
                                text = re.sub(pattern, replace, unicode(paragraph))
                                
                                #print len(self.citations), self.citations
                            elif item.replace('Figure: ' , '') in self.figures:
                                #print "Figure need to be handled", item, paragraph.references[finding]
                                mapping = paragraph.references
                                text = re.sub(pattern, replace, unicode(paragraph))
                                #print item, "is not in references."
                            elif item.replace('Table: ', '') in self.tables or item.replace('Table: ', '')[:-1] in self.tables:
                                mapping = paragraph.references
                                text = re.sub(pattern, replace, unicode(paragraph))
                            else:
                                print "Ref did not match anything:", finding
                                if "Table: " in item: # I don't know why this has to be here?
                                    mapping = paragraph.references
                                    text = re.sub(pattern, replace, unicode(paragraph))
                                else:
                                    try:
                                        if len(item) > 5: # Too short will probably match something weird.
                                            # Search via self.bibliography.search and accept only a unique hit:
                                            ids = self.bibliography.search(term=item)
                                            if len(ids) == 1:
                                                r = self.bibliography.find(int(ids[0]), printing=False)
                                                #print r
                                                self.citations.add(r[0])
                                                #print r[0]
                                                paragraph.references[finding].append(r[0])
                                                mapping = paragraph.references
                                                #print paragraph.references
                                                text = re.sub(pattern, replace, unicode(paragraph))
                                               #print text
                                            else:
                                                 raise Exception
                                        else:
                                             raise Exception
                                    except:                                    
                                        try:
                                            # Assumed to be already a reference:
                                            paragraph.references[finding].append(item)
                                            text = re.sub(pattern, replace, unicode(paragraph))
                                            #print finding, paragraph.references[finding]
                                        except Exception as e:
                                            print e, finding, "is no reference."
                                
                    paragraph.text = text or paragraph.text
                    # Wrap each term of LATIN in asterisks (emphasis markup).
                    for term in LATIN:
                       paragraph.text = paragraph.text.replace(" %s " % term, " *%s* " % term).replace("(%s)" % term, " (*%s*)" % term)
                        


                        # Check whether reference is in references
                        # Create a reference instance
                        # declare type of representation
                        # insert representation as requested.
                        # pmid?

    ##                    def replace(finding):
    ##                        match = finding.group(0)
    ##                        print "match:", match
    ##                        if match[1:-1] in mapping:
    ##                            print "match in mapping"
    ##                            return mapping[match[1:-1]]
    ##                        else:
    ##                            return match
                        
                    #r = Reference(Entrez.read(Entrez.esummary(db='pubmed', id=id)))
                    #print "fetching reference"
                    
                    #print r
                    #print type(r)
                    ##print r.ref()#, findings
                    #print "mapping reference"
                    #mapping[finding] = r.ref()
                    #print id
                    #print type(r[0].ref())
                    #print "ref()", r[0].ref(), r[0].ref(), r[0].ref()
                    #print mapping[finding]
  
##                    #except:
##                        # It is not a pmid:
##                        if  "et al." in item:
##                            name, year = item.split('et al.')
##                            name, year = name.strip(), year.strip()
##                            for reference in self.references:
##                                if name in reference and year in reference:
##                                    mapping[finding] = reference
##                                    print "mapping:", mapping
##                                    #print reference
##                                    print "substituitng"
##                                    new_paragraph = re.sub("\[(.+?)\]", replace, str(paragraph))#, len(re.sub("\[(.+?)\]", replace, str(paragraph)))
##                                    print new_paragraph
                      
    def glossaring(self):
        """Creates an automatic generated glossary."""
        for paragraph in self.paragraphs:
            findings = re.findall("[A-Z]{2,}", unicode(paragraph)) #More than two capital letters.
            #print findings
            
            abbreviation_explained = re.findall("[a-z]?[A-Z]{2,}[a-z]? \(.*?\)", unicode(paragraph)) #"([a-z]{1})?[A-Z]{2,}([a-z]{1})? \(.*?\)"
            for abbreviation_explaination in abbreviation_explained:
                #print abbreviation_explaination
                abbreviation = abbreviation_explaination.split(' (')[0]
                explaination = abbreviation_explaination.split('(')[1].split(')')[0]
                self.glossary[abbreviation] = explaination
                
            explaination_abbreviated = re.findall("\(([a-z]?[A-Z]{2,}[a-z]?)\)", unicode(paragraph))
            #print explaination_abbreviated
            for explaination_abbreviation in explaination_abbreviated:
                number_of_words = len(explaination_abbreviation)
                if explaination_abbreviation[-1] == 's': # Plural
                    number_of_words -= 1
                # Form a regular expression to fetch the explaination in front the abbreviation:
                word = '[A-Z,a-z,0-9]+ '
                regex = '%s\([a-z]?[A-Z]{2,}[a-z]?\)' % (word * number_of_words)
                #print regex
                results = re.findall(regex, unicode(paragraph))
                for result in results:
                    explaination = result.split('(')[0]
                    abbreviation = result.split(' (')[1].split(')')[0]
                    self.glossary[abbreviation] = explaination
                    #print result

    def counting(self):
        """Counts the character lenght of each section."""
        self.count = 0
        for section in self.sections:
            section.count = 0
            for paragraph in section.paragraphs:
                section.count += paragraph.count
            for subsection in section.subsections:
                section.count += len(subsection.title.replace('**', '*').replace('*', ''))
                for paragraph in subsection.paragraphs:
                    subsection.count += paragraph.count
                    section.count += paragraph.count
            print section.title, section.count
                        

    def structuring(self, type='Console', emphasis='**', count=False, rest=True, connect=False):
        print("Structuring")
        figuresAndTables = False # Will be set to True if there is a Tables a Figures section and prevent the generation of automtaic table and figures section.
        if count: self.counting()
        structure = self.structure
        if self.title: structure.append(emphasis+unicode(self.title)+emphasis)
        if self.abstract:
            structure.append('\n'+emphasis+'Abstract:'+emphasis)
            if count: structure[-1] += ' [%s]' % self.abstract.count
            structure.append('%s' % self.abstract)
        if self.keywords:
            structure.append('\n'+emphasis+'Keywords'+emphasis+': %s' % self.keywords)
        if self.content:
            structure.append('\n'+emphasis+'Content:'+emphasis)
            structure.append(str(self.content))
        #print("Glossary: %s" % self.glossary)
        if self.glossary:
            #print("Has glossary")
            structure.append('\n'+emphasis+'Glossary:'+emphasis)
            if count: structure[-1] += ' [%s]' % len(self.glossary)
            abbreviations = [structure.append('%s = %s' % (k, v)) for k,v in self.glossary.items()] #.title()
        structure.append('')
        if self.sections:
            for section in self.sections:
                structure.append(emphasis+unicode(section)+emphasis)
                if count: structure[-1] += ' [%s]' % section.count
                if section.paragraphs:
                    for paragraph in section.paragraphs:
                        if not rest: structure.append('%s\n' % unicode(paragraph).replace('**\n', '.** '))
                if section.subsections:
                    for subsection in section.subsections:
                        structure.append(emphasis+str(subsection)+emphasis)
                        if count: structure[-1] += ' [%s]' % subsection.count
                        for paragraph in subsection.paragraphs:
                            if not rest: structure.append('%s\n' % unicode(paragraph).replace('**\n', '.** '))
                if section.title == "Figures & Tables":
                    if count:
                        structure[-1] = structure[-1][:-3]+' [%s] [%s]' % (len(self.figures), len(self.tables))
                        #structure[-1] += ' [%s] [%s]' % (len(self.figures), len(self.tables))

                    structure.append(unicode(self.figures) or '')
                    structure.append(unicode(self.tables) or '')
                    figuresAndTables = True
##                else:
##                    for paragraph in section.paragraphs:
##                        structure.append('%s\n' % paragraph)
        else:
            structure.append('\n\n'.join(map(unicode, self.paragraphs))) # Call referencing here.

        if rest:
            if self.connect:
                structure.append('\n============\nBibliography\n============\n')
            else:
                structure.append('\nReferences\n==========') # ReST

        else: structure.append('\n'+emphasis+'References'+emphasis)
        if count: structure[-1] += ' [%s]' % len(self.citations)
        #structure.append(str("\n".join(self.references)))
        structure.append(str(self.citations))

        if not figuresAndTables:
            structure.append('')
            if self.figures:
                structure.append('**Figures**')
                structure.append(unicode(self.figures) or '')
                structure.append('')
            if self.tables:
                structure.append('**Tables**')
                structure.append('')
                structure.append(unicode(self.tables) or '')

        if rest:
            #print("rest")
            #print structure
            for index, part in enumerate(structure[:]):
                if "Glossary" in part:
                    #print("Glossary in structure")
                    abbreviations = "\n".join(["%s\n    %s" % (k, v) for k, v in self.glossary.items()])
                    #print(abbreviations)
                    structure[index].replace('========\nGlossary\n========', '========\nGlossary\n========\n%s' % abbreviations)

        return structure


    def _string(self):
        """Transforms structure into a single string."""
        if not self.structure: self.structuring()
        return '\n'.join(self.structure)
    string = property(_string)

    def __repr__(self):
        """Prints brief consentive summary of the article
        Number of sections, parapgraphs and total word count"""
        return self.string

    def __unicode__(self):
        return self.__repr__()

    def __len__(self):
        """Character count."""
        return len(self.string)

    def meta(self):
        """Placeholder for returning meta-information such as a table of contents.

        Not implemented: the method has no body and returns None.
        """

    def __str__(self):
        return self.string

    def findCitation(self):
        """Placeholder for a proof of concept of whether a pmid is sufficient.

        Not implemented: the method has no body and returns None.
        """

    def output(self):
        article.glossaring()
        article.referencing()
        print self

    def printIt(self):
        for section in self.sections:
            print section
            print
            for subsection in section.subsections:
                    print subsection
                    print
                    #print len(subsection.paragraphs)
                    for paragraph in subsection.paragraphs:
                            print paragraph
                            print
    def word(self, input=None, output=None, close=False, insert=False):
        """Send the article text to a Word document.

        Args:
            input: Passed to ``Word(...)`` when truthy; otherwise ``Word()``
                is created without arguments.
            output: Passed to ``save(...)`` when truthy; otherwise ``save()``
                is called without arguments.
            close: If true, the document is closed after saving.
            insert: If true, the text is handed to ``insert`` instead of
                ``write``.
        """
        document = Word(input) if input else Word()
        deliver = document.insert if insert else document.write
        deliver(self.string)
        if output:
            document.save(output)
        else:
            document.save()
        if close:
            document.close()


    def wd(self, input=None, output=None, close=False, insert=False):
        """Creates a Word output document including emphasis.
        Note this is unfortunaly very slow."""
        if input:
            word = Word(input)
        else:
            word = Word()
        #word.visible()
            
        strings = self.string.split('\n')
        strings.reverse()
        for string in strings:
            print string
            if "*" in string:
                word.insertEmphasis(string+'\n')
            else:
                word.insert(string+'\n')
        if output:
            word.save(output)
        else:
            word.save()
        if close:
            word.close()

    def wordIt(self, input=None, output=None, close=False):
        """Creates a Word output document including emphasis.
        Note this is unfortunaly very slow."""
        if input:
            word = Word(input)
        else:
            word = Word()

        for figure in self.figures.values():
            figure.filename =  os.path.join(os.getcwd(), figure.filename).replace('\\', '/') #self.name[:-3]

        word.i(self.string)
        for figure in self.figures.values():
            try:
                word.insertImage(figure.filename)
            except Exception as e:
                print e
                pass

        for table in self.tables.values():
            if not table.raw:
                word.insertTable(table)
        
                #figure.filename =  os.path.join(os.getcwd(), self.name[:-3], figure.filename) 
                #word.insertImage(os.path.join(os.getcwd(), self.name[:-3], figure.filename)) #__file__
##        if output:
##            try: word.save(output)
##            except: pass
##        else:
##            try: word.save()
##            except: pass
##        if close:
##            word.close()


    def docX(self):
        """Produces a docx representation of the article."""

        # Default set of relationships - these are the minimum components of a document
        relationships = relationshiplist()        
        
        # Make a new document tree - this is the main part of Word document.
        document = newdocument()

        # This xpath Location is where most interesting content lives
        docbody = document.xpath('/w:document/w:body', namespaces=nsprefixes)[0]

        docbody.append(heading(str(self.title), 1))
        docbody.append(heading('Abstract', 2))
        docbody.append(paragraph(str(self.abstract)))
        for section in self.sections:
            docbody.append(heading(str(section.title), 1))
            if section.paragraphs:
                for para in section.paragraphs:
                    docbody.append(paragraph(str(para), jc='both'))
            if section.subsections:
                for subsection in section.subsections:
                    docbody.append(heading(str(subsection), 2))
                    for para in subsection.paragraphs:
                        docbody.append(paragraph(str(para), jc='both'))


##        docbody.append(heading("Tables", 1))
##        for title, t in self.tables.items():
##            docbody.append(paragraph(title, style='ListNumber')) # ListBullet
##            docbody.append(table(str(t).split('\n')))
##
        # Add an image:
        relationships, picpara = picture(relationships, 'image1.png', 'Figure 1')
        docbody.append(picpara)

        # Search and replace:
        print 'Searching for soemthing in a paragraph ...',
        if search(docbody, 'the awesomeness'): print 'found it!'
        else: print 'nope.'

        print 'Searching for something in a heading ...',
        if search(docbody, '200 lines'): print 'found it!'
        else: print 'nope.'

        # Add a pagebreak:
        docbody.append(pagebreak(type='page', orient='portrait'))

        docbody.append(heading('References', 1))
        docbody.append(paragraph(str(self.citations)))
        
        # Create the properteies, contenttypes, and other support files:
        coreprops = coreproperties(title='Human Frontier Science Promotion',
                                   subject='Full Application',
                                   creator='Daniel Wuttke',
                                   keywords=self.keywords)
        appprops = appproperties()
        contypes = contenttypes()
        webings = websettings()
        wordships = wordrelationships(relationships)

        # Save the document:
        savedocx(document, coreprops, appprops, contypes, webings, wordships,
                 'HFSP.docx')

    def show(self):
        """Create a ``Word`` instance and call its ``show()`` method."""
        Word().show()
       
    def wordle(self):
        """Creates a wordle of a doucment"""
        from library.wc import wordCloud
        string = []
        for section in self.sections:
            string.append(str(section))
            for subsection in section.subsections:
                string.append(str(subsection))
        for paragraph in self.paragraphs:
            string.append(str(paragraph))
        string = " ".join(string)
        print "Wordle string:", string

        # Remove references:
        string = re.sub(r'\[.*?\]', '', string) #http://stackoverflow.com/questions/640001/how-can-i-remove-text-within-parentheses-with-a-regex
        
        wordCloud(string.replace('**', '').replace('*', '').replace(' et al.', ' ').replace('JPM', '').replace(',', '').replace(';', '').replace(':', '')) #self.string

    def contenting(self):
        """Create a ``Content`` object from the sections, store it in
        ``self.content`` and return it."""
        table_of_content = Content(self.sections)
        self.content = table_of_content
        return self.content