Ejemplo n.º 1
0
    def __init__(self, mainAuthor, authorFile, publicationFile):
        """Create the Society, initialise its files and ensure a main author.

        All containers start empty, then ``_initFiles`` is called with the
        two file names. If no main author is set after that call,
        ``mainAuthor`` is passed to ``_addMainAuthor``.
        """
        # Manager used by the Society to emit its messages
        self.m = Manager()
        self.m.info('A new scientific Society has been created.')

        # Data files (placeholders until _initFiles has run)
        self.authorFile = None
        self.publicationFile = None

        # Main author and its update flag
        self.author = None
        self.authorUpdate = True

        # Other authors and the lookup table that goes with them
        self.authors = []
        self.knownAuthors = {}

        # Publications and the lookup table that goes with them
        self.publications = []
        self.knownPublications = {}

        self._initFiles(authorFile, publicationFile)
        if self.author is not None:
            # A main author is already set; nothing else to do.
            return
        self._addMainAuthor(mainAuthor)
Ejemplo n.º 2
0
from librarian import Manager


# from .items import Researcher
from .items import Society

if __name__ == "__main__":

    # Configure the manager first; the three setters are called in the
    # same order as before (stdout, detail, overwrite).
    m = Manager()
    for configure in (m.set_stdout, m.set_detail, m.set_overwrite):
        configure()

    # Identifier passed to Society as the main author; it also tags the
    # names of the data files.
    RGid = '15420125'
    update_journal = True

    society_file = 'society.{0}.json'.format(RGid)
    publication_file = 'publications.{0}.json'.format(RGid)
    # NOTE(review): journal_file is computed but not used in this script.
    journal_file = 'journals.{0}.json'.format(RGid)

    # Build the society and capture its publications.
    society = Society(RGid, society_file, publication_file)
    society.searchPublications(update_journal)
Ejemplo n.º 3
0
class Society(object):
    """A main author, the other authors found in publications, and the
    publications themselves.

    Authors and publications are kept as lists of dictionaries. The
    ``knownAuthors`` and ``knownPublications`` dictionaries map an
    identifier to the position of the matching entry in ``authors`` and
    ``publications``. Both registries are written to their files by
    handlers registered with ``atexit``.
    """

    # Tag name and attributes used to select publication entries in a
    # parsed listing page.
    publATTR = ('li', {'class': 'li-publication'})

    def __init__(self, mainAuthor, authorFile, publicationFile):
        """Create the Society, load existing registries and set the main author.

        mainAuthor      -- identifier passed to ``_addMainAuthor`` when no
                           main author was loaded from ``authorFile``.
        authorFile      -- name of the authors registry file.
        publicationFile -- name of the publications registry file.
        """
        # A manager for the Society
        self.m = Manager()
        self.m.info('A new scientific Society has been created.')

        # Content
        self.authors           = []
        self.author            = None
        self.authorUpdate      = True
        self.knownAuthors      = {}
        self.publications      = []
        self.knownPublications = {}

        # Data Files
        self.authorFile      = None
        self.publicationFile = None

        self._initFiles(authorFile, publicationFile)
        if self.author is None:
            self._addMainAuthor(mainAuthor)

    def isAuthorInSociety(self, identifier):
        """Return True if ``int(identifier)`` is a key of ``knownAuthors``."""
        # Test the dict itself: ``.keys()`` builds a list on Python 2.
        return int(identifier) in self.knownAuthors

    def isPublicationInSociety(self, identifier):
        """Return True if ``int(identifier)`` is a key of ``knownPublications``."""
        return int(identifier) in self.knownPublications

    def getAuthor(self, identifier):
        """Return the author entry registered under ``int(identifier)``.

        Raises KeyError when the author is not registered.
        """
        # Same int() normalisation as isAuthorInSociety, so an identifier
        # accepted by the membership test can always be fetched.
        return self.authors[self.knownAuthors[int(identifier)]]

    def getPublication(self, identifier):
        """Return the publication entry registered under ``int(identifier)``.

        Raises KeyError when the publication is not registered.
        """
        return self.publications[self.knownPublications[int(identifier)]]

    def searchPublications(self, updateJournal):
        """Read the main author's publication pages and register their entries.

        Pages are requested one after another until the number of stored
        publications reaches ``self.author['pub_num']`` or a page contains
        no publication entry.
        """
        url = '{0}/profile/{1}/publications?sorting=newest&page={2}'
        tag, attrs = self.publATTR
        page = 1
        while len(self.publications) < self.author['pub_num']:
            soup = URL2soup(url.format(Researcher.baseURL,
                                       self.author['profileName'], page))
            entries = soup(tag, attrs=attrs)
            if not entries:
                # Without this stop, a 'pub_num' larger than the number of
                # listed entries would request pages forever.
                self.m.info('No publications found in page {0}.'.format(page))
                break
            for entry in entries:
                self._addPublication(entry, updateJournal)
            page += 1

    def _addPublication(self, soup, updateJournal):
        """Register the publication described by ``soup`` if it is new.

        The identifier is the integer after the last underscore of the
        entry's ``id`` attribute. For a new publication every author is
        replaced by ``r2p(<author identifier>)`` and the last one is
        flagged as corresponding.
        """
        identifier = int(soup['id'].split('_')[-1])
        if self.isPublicationInSociety(identifier):
            self.m.info('Updating data of {0}'.format(identifier))
        else:
            self.m.info('Capturing new publication {0}.'.format(identifier))
            publication = Publication(identifier, soup)
            for i, a in enumerate(publication.authors):
                if isinstance(a, int):
                    a = self._trainAuthor(a)['identifier']
                else:
                    # Anything that is not an int is taken to be the main
                    # author.
                    a = self.author['identifier']
                publication.authors[i] = r2p(a)
            if publication.authors:
                # Guard against an empty author list (IndexError before).
                publication.authors[-1]['corresponding'] = True
            self.publications.append(publication.__dict__)
            self.knownPublications[identifier] = len(self.publications) - 1

        # TODO: update PI and citation data (updateJournal is not used yet).

    def _initFiles(self, authorFile, publicationFile):
        """Open the authors registry and then the publications registry."""
        self._openFile(ftype = 'authors',      filename = authorFile)
        self._openFile(ftype = 'publications', filename = publicationFile)

    def _addMainAuthor(self, identifier):
        """Fetch the main author and store it in ``self.author``.

        When a main author is already set, the new data is compared with
        it: ``authorUpdate`` is cleared if both are equal, and changes in
        'pub_num', 'citations' and 'impact' are reported.
        """
        ma = self._trainAuthor(identifier, mainAuthor = True)
        if self.author is not None:
            if ma == self.author:
                self.authorUpdate = False
            if ma['pub_num'] != self.author['pub_num']:
                n = ma['pub_num'] - self.author['pub_num']
                self.m.info('There are {0} new publications!'.format(n))
            if ma['citations'] != self.author['citations']:
                n = ma['citations'] - self.author['citations']
                self.m.info('There are {0} new citations!'.format(n))
            if ma['impact'] != self.author['impact']:
                n = ma['impact'] - self.author['impact']
                self.m.info('The impact factor is up by {0}!'.format(n))
        self.author = ma

    def _trainAuthor(self, identifier, mainAuthor = False):
        """Return the author entry for ``identifier``, fetching it if needed.

        An author that is already registered is returned as stored, unless
        ``mainAuthor`` is true. Otherwise a ``Researcher`` is created and
        its ``__dict__`` is returned; it is also added to the registry
        when it is not the main author.
        """
        if not mainAuthor and self.isAuthorInSociety(identifier):
            ostring = 'Author {0} already belongs to the Society.'.format(identifier)
            self.m.info(ostring)
            return self.getAuthor(identifier)

        self.m.info('Getting data for author {0}.'.format(identifier))
        r = Researcher(identifier, mainAuthor).__dict__
        if not mainAuthor:
            self.authors.append(r)
            self.knownAuthors[r["identifier"]] = len(self.authors) - 1
        return r

    def _openFile(self, ftype, filename):
        """Load the ``ftype`` registry from ``filename`` if that file exists.

        ``ftype`` is 'authors' or 'publications'. In both cases the file
        name is stored and the matching writer is registered with
        ``atexit``.
        """
        if os.path.isfile(filename):
            self.m.info('A previous {0} registry is loaded'.format(ftype))
            fd = File(filename)
            try:
                if ftype == 'authors':
                    self._loadAuthorRegistry(fd.readJSON())
                elif ftype == 'publications':
                    self._loadPublicationRegistry(fd.readJSON())
            finally:
                # Release the file even when reading or loading fails.
                fd.close()
                fd.unregister()
        else:
            self.m.info('A new {0} registry is created'.format(ftype))

        if ftype == 'authors':
            self.authorFile = filename
            atexit.register(self._writeAuthorsFile)
        elif ftype == 'publications':
            self.publicationFile = filename
            atexit.register(self._writePublicationsFile)

    def _loadAuthorRegistry(self, jsonArray):
        """Load authors from ``jsonArray``; its first entry is the main author.

        An empty array leaves ``self.author`` unchanged and the authors
        list empty. The array itself is not modified.
        """
        if jsonArray:
            self.author = jsonArray[0]
        # Copy the rest so the caller's list is left untouched.
        self.authors = list(jsonArray[1:])
        for i, author in enumerate(self.authors):
            self.knownAuthors[author['identifier']] = i

    def _loadPublicationRegistry(self, jsonArray):
        """Use ``jsonArray`` as the publications list and index its entries."""
        self.publications = jsonArray
        for i, publication in enumerate(self.publications):
            self.knownPublications[publication['identifier']] = i

    def _writeAuthorsFile(self):
        """Write the main author followed by the other authors as JSON."""
        # Build a new list rather than inserting into self.authors, so the
        # positions stored in knownAuthors stay valid and a second call
        # does not duplicate the main author.
        registry = [self.author] + self.authors
        # Use a local handle so self.authorFile remains the file name.
        fd = File(self.authorFile, 'w')
        try:
            fd.write(json.dumps(registry, indent=2,
                                separators=(',', ': ')))
        finally:
            fd.close()

    def _writePublicationsFile(self):
        """Write the publications list as JSON."""
        # Use a local handle so self.publicationFile remains the file name.
        fd = File(self.publicationFile, 'w')
        try:
            fd.write(json.dumps(self.publications, indent=2,
                                separators=(',', ': ')))
        finally:
            fd.close()
Ejemplo n.º 4
0
def URL2soup(url):
    """Download ``url`` and return its content parsed by BeautifulSoup.

    The response is closed once its body has been read, including when
    reading fails.
    """
    m = Manager()
    m.debug('Reading URL: {0}'.format(url))
    response = urllib2.urlopen(url)
    try:
        content = response.read()
    finally:
        # Close explicitly so the connection is not left open.
        response.close()
    return BeautifulSoup(content)