def __init__(self, name):
    """Attach the archive, master archive and library objects to ``self``.

    ``name`` is passed through unchanged to ``Archive``; the other two
    collaborators are constructed without arguments.
    """
    # Created in this order: named archive, then master archive, then library.
    self.archive = Archive(name)
    self.master = MasterArchive()
    self.lib = Library()
class SavedRecordParser:
    """Parse tagged "saved record" export text into articles of an archive.

    The input is scanned for records that start with ``PT J`` and end with a
    line beginning with ``ER``.  Inside a record, fields are located by a pair
    of two-letter tags (the field's own tag and the tag expected to follow
    it).  Each parsed record becomes an article created by ``self.archive``
    and is added to that archive unless ``self.master`` already has it.

    NOTE(review): the ``ISIError`` / ``ISIArticle`` names suggest this is the
    ISI / Web of Science export format -- confirm against the data source.
    """

    # One record: everything between the "PT J" header and the closing "ER".
    _RECORD_PATTERN = r"PT\sJ(.*?)\nER"
    # First run of decimal digits in a field value.
    _NUMBER_PATTERN = r"(\d+)"

    def __init__(self, name):
        """Create the parser and the archive called ``name`` it fills."""
        self.archive = Archive(name)
        self.master = MasterArchive()
        self.lib = Library()

    def __iter__(self):
        """Iterate over the underlying archive."""
        return iter(self.archive)

    def __getitem__(self, index):
        """Return ``self.archive[index]``."""
        return self.archive.__getitem__(index)

    def add_pdf(self, path):
        """Forward ``path`` to the archive's ``add_pdf``."""
        self.archive.add_pdf(path)

    def get_text(self, text, start, stop):
        """Return the stripped text found between two tags, or ``None``.

        ``start`` and ``stop`` are upper-cased and interpolated, unescaped,
        into the pattern ``"\\n<START> (.*?)\\n<STOP> "``.  The start tag must
        therefore begin a line (preceded by a newline) and both tags must be
        followed by a space.  The match is non-greedy and may span lines.
        """
        pattern = "\n%s (.*?)\n%s " % (start.upper(), stop.upper())
        match = re.search(pattern, text, re.DOTALL)
        if match is None:
            return None
        return match.group(1).strip()

    def exclude_entry(self, entry, exclude):
        """Return ``True`` if any string in ``exclude`` occurs in ``entry``."""
        return any(marker in entry for marker in exclude)

    def get_entry(self, attr, method=None, default=None, require=True,
                  exclude=(), entries=()):
        """Extract one field from ``self.block`` and store it on the article.

        The ``(start, stop)`` tag pairs in ``entries`` are tried in order.
        The first pair yielding a non-empty value that contains none of the
        ``exclude`` substrings wins: the value is optionally converted with
        ``method`` and handed to ``self.article.set_<attr>``.

        If no pair matches, ``default`` is stored when it is not ``None``.
        Otherwise, when ``require`` is true, the record is written to stderr
        and ``ISIError`` is raised listing the tag pairs that were tried.
        When ``require`` is false the article is simply left untouched.

        Exceptions raised by ``method`` propagate to the caller.
        """
        setter = getattr(self.article, "set_%s" % attr)
        tried = []
        for start, stop in entries:
            tried.append("%s->%s" % (start, stop))
            entry = self.get_text(self.block, start, stop)
            if not entry or self.exclude_entry(entry, exclude):
                continue
            if method:
                entry = method(entry)
            setter(entry)
            return

        if default is not None:
            setter(default)
            return

        if require:
            sys.stderr.write("ERROR: %s\n" % self.block)
            msg = "no %s for tags\n" % attr
            msg += "\n".join(tried)
            raise ISIError(msg)

    def _first_number(self, value):
        """Return the first run of digits in ``value`` as a string.

        Raises ``ValueError`` when ``value`` contains no digit, instead of
        failing with an opaque ``AttributeError`` on ``None``.
        """
        match = re.search(self._NUMBER_PATTERN, value)
        if match is None:
            raise ValueError("no number found in %r" % (value,))
        return match.group(1)

    def feed(self, text, notes):
        """Parse every record in ``text`` and add the new articles.

        Each record is parsed into a fresh article, ``notes`` is attached to
        it, and the article is passed to ``self.archive.test_and_add`` unless
        ``self.master`` already has it (in which case a message is printed).

        Parsing is best effort per record: any exception raised while
        handling a record is reported on stderr together with the record
        text, and processing continues with the next record.
        """
        first_number = self._first_number

        # Converters are loop-invariant, so they are built once up front.
        def to_issue(value):
            return int(first_number(value))

        def to_page(value):
            return Page(first_number(value))

        def to_authors(value):
            return get_authors(value, "\n", ",")

        for block in re.findall(self._RECORD_PATTERN, text, re.DOTALL):
            try:
                self.block = block
                self.article = self.archive.create_article()

                # Several tag pairs per field: the tag following a field
                # varies between records, so alternatives are tried in order.
                self.get_entry("journal",
                               entries=(("so", "la"), ("so", "ab"),
                                        ("so", "sn")))
                self.get_entry("volume", method=int,
                               entries=(("vl", "is"), ("vl", "bp")))
                self.get_entry("issue", method=to_issue, require=False,
                               entries=(("is", "bp"),))
                # NOTE(review): this literal was split across a line break in
                # the reviewed source; reconstructed as "art. no." -- confirm.
                self.get_entry("start_page", method=to_page,
                               exclude=("art. no.",),
                               entries=(("bp", "ep"), ("bp", "ut"),
                                        ("ar", "di"), ("ar", "ut")))
                self.get_entry("end_page", method=to_page, require=False,
                               entries=(("ep", "di"), ("ep", "ut")))
                self.get_entry("authors", method=to_authors,
                               entries=(("af", "ti"), ("au", "ti"),
                                        ("au", "so")))
                self.get_entry("title", method=Cleanup.clean_title,
                               entries=(("ti", "so"),))
                self.get_entry("abstract", method=clean_entry, require=False,
                               entries=(("ab", "sn"),))
                self.get_entry("year", method=int,
                               entries=(("py", "vl"), ("py", "tc")))
                self.get_entry("doi", require=False,
                               entries=(("di", "pg"), ("di", "ut"),
                                        ("di", "er")))

                self.article.set_notes(notes)

                # Built before the duplicate check (as originally) so that a
                # record whose journal/volume/page cannot be formatted is
                # rejected rather than added.
                journal = ISIArticle.get_journal(self.article.get_journal())
                volume = self.article.get_volume()
                page = self.article.get_page()
                name = "%s %d %s" % (journal, volume, page)

                if not self.master.has(self.article):
                    self.archive.test_and_add(self.article)
                else:
                    println("%s exists in archive\n" % name)
            except Exception as error:
                # Top-level boundary for one record: report and move on.
                sys.stderr.write("ERROR: %s\n%s\n" % (error, block))