Beispiel #1
0
 def __init__(self, name):
     """Set up the objects this instance works with.

     name -- passed unchanged to ``Archive``; its meaning (file name,
             archive label, ...) is defined by ``Archive`` and is not
             visible from here.
     """
     # Archive built from the caller-supplied name.
     self.archive = Archive(name)
     # MasterArchive and Library are constructed without arguments.
     self.master = MasterArchive()
     self.lib = Library()
Beispiel #2
0
class SavedRecordParser:
    """Parse tagged bibliographic records and store them as articles.

    ``feed`` splits its input into records that start with ``PT J`` and end
    at a line beginning with ``ER`` (this looks like the ISI / Web of Science
    export format -- confirm against the data source).  Every record is
    turned into an article created by ``self.archive`` and is added to that
    archive unless ``self.master`` already has it.

    Iteration, indexing and ``add_pdf`` are delegated to ``self.archive``.
    """

    # One record: everything between a "PT J" header and the next "\nER".
    _RECORD_RE = re.compile(r"PT\sJ(.*?)\nER", re.DOTALL)
    # First run of decimal digits inside a field value.
    _NUMBER_RE = re.compile(r"(\d+)")

    def __init__(self, name):
        """Create the archive for *name* plus the master archive and library."""
        self.archive = Archive(name)
        self.master = MasterArchive()
        self.lib = Library()

    def __iter__(self):
        """Iterate over the underlying archive."""
        return iter(self.archive)

    def __getitem__(self, index):
        """Return ``self.archive``'s item at *index*."""
        return self.archive.__getitem__(index)

    def add_pdf(self, path):
        """Forward *path* to ``self.archive.add_pdf``."""
        self.archive.add_pdf(path)

    def get_text(self, text, start, stop):
        """Return the stripped text between two tags, or None if not found.

        The value is whatever lies between a line starting with the
        upper-cased *start* tag followed by a space and the next line
        starting with the upper-cased *stop* tag followed by a space.
        The tags are inserted into the pattern verbatim (not escaped).
        """
        pattern = "\n%s (.*?)\n%s " % (start.upper(), stop.upper())
        match = re.search(pattern, text, re.DOTALL)
        if match is None:
            return None
        return match.group(1).strip()

    def exclude_entry(self, entry, exclude):
        """Return True if any item of *exclude* is contained in *entry*."""
        return any(exc in entry for exc in exclude)

    def get_entry(self, attr, method=None, default=None, require=True, exclude=(), entries=()):
        """Extract one field from ``self.block`` and store it on the article.

        attr    -- field name; the value is stored via
                   ``self.article.set_<attr>``.
        method  -- optional callable applied to the raw text before storing.
        default -- value stored when no tag pair yields a usable entry
                   (ignored when it is None).
        require -- when true and neither an entry nor a default is
                   available, the block is written to stderr and
                   ``ISIError`` is raised.
        exclude -- substrings; an entry containing any of them is skipped.
        entries -- ``(start, stop)`` tag pairs tried in order; the first
                   pair giving a non-empty, non-excluded entry wins.
        """
        setter = getattr(self.article, "set_%s" % attr)
        tried = []  # "start->stop" labels, reported if nothing matches
        for start, stop in entries:
            tried.append("%s->%s" % (start, stop))
            entry = self.get_text(self.block, start, stop)
            if not entry or self.exclude_entry(entry, exclude):
                continue
            if method:
                entry = method(entry)
            setter(entry)
            return

        if default is not None:
            setter(default)
            return

        if require:
            sys.stderr.write("ERROR: %s\n" % self.block)
            msg = "no %s for tags\n" % attr
            msg += "\n".join(tried)
            raise ISIError(msg)

    def _first_number(self, value):
        """Return the first run of digits in *value* as a string.

        Raises ValueError when *value* contains no digit, so the failure
        reported by ``feed`` names the offending text.
        """
        match = self._NUMBER_RE.search(value)
        if match is None:
            raise ValueError("no number found in %r" % (value,))
        return match.group(1)

    def _to_issue(self, value):
        """Convert an issue field to an int using its first number."""
        return int(self._first_number(value))

    def _to_page(self, value):
        """Build a ``Page`` from the first number of a page field."""
        return Page(self._first_number(value))

    def _to_authors(self, value):
        """Split an author field with ``get_authors`` (one author per line)."""
        return get_authors(value, "\n", ",")

    def feed(self, text, notes):
        """Parse every record in *text* and add new articles to the archive.

        text  -- raw export text containing zero or more records.
        notes -- stored on each created article via ``set_notes``.

        A record that fails to parse is reported on stderr and skipped; the
        remaining records are still processed.  Articles already present in
        ``self.master`` are reported through ``println`` and not added.
        """
        for block in self._RECORD_RE.findall(text):
            try:
                # get_entry reads these two attributes.
                self.block = block
                self.article = self.archive.create_article()

                self.get_entry("journal", entries=(("so", "la"), ("so", "ab"), ("so", "sn")))
                self.get_entry("volume", method=int, entries=(("vl", "is"), ("vl", "bp")))
                self.get_entry("issue", method=self._to_issue, require=False, entries=(("is", "bp"),))
                self.get_entry("start_page", method=self._to_page, exclude=("art. no.",), entries=(("bp", "ep"), ("bp", "ut"), ("ar", "di"), ("ar", "ut")))
                self.get_entry("end_page", method=self._to_page, require=False, entries=(("ep", "di"), ("ep", "ut")))

                self.get_entry("authors", method=self._to_authors, entries=(("af", "ti"), ("au", "ti"), ("au", "so")))

                self.get_entry("title", method=Cleanup.clean_title, entries=(("ti", "so"),))
                self.get_entry("abstract", method=clean_entry, require=False, entries=(("ab", "sn"),))
                self.get_entry("year", method=int, entries=(("py", "vl"), ("py", "tc")))

                self.get_entry("doi", require=False, entries=(("di", "pg"), ("di", "ut"), ("di", "er")))

                self.article.set_notes(notes)

                # Built before the duplicate check on purpose: a failure in
                # the journal lookup or the formatting below rejects the
                # record before it can be added.
                journal = ISIArticle.get_journal(self.article.get_journal())
                volume = self.article.get_volume()
                page = self.article.get_page()
                name = "%s %d %s" % (journal, volume, page)
                if self.master.has(self.article):
                    println("%s exists in archive\n" % name)
                else:
                    self.archive.test_and_add(self.article)
            except Exception as error:
                # Deliberately broad: one malformed record must not abort
                # the whole import, so it is logged and the loop continues.
                sys.stderr.write("ERROR: %s\n%s\n" % (error, block))