def _scrape_unit(self, _file):
    """Scrape one unit: a PDF document.

    Extracts all text from the PDF page by page, wraps it in a single
    Article carrying headline/medium/section metadata taken from
    self.options, and yields that article.

    Falls back to today's date when self.options['date'] is falsy.
    """
    parser = PDFParser()
    doc = parser.load_document(_file, self.options['pdf_password'])
    # Collect page texts in a list and join once: repeated `res +=`
    # string concatenation in a loop is quadratic.
    pages = []
    for page in parser.process_document(doc):
        page_txt = "".join(
            line.get_text() + "\n" for line in parser.get_textlines(page)
        )
        pages.append(page_txt + "\n\n")
    article = Article(text="".join(pages))
    article.headline = self.getheadline(_file)
    article.medium = self.options['medium']
    article.section = self.options['section']
    # Explicit date wins; otherwise default to today.
    article.date = self.options['date'] if self.options['date'] else date.today()
    yield article
def _scrape_unit(self, _file):
    """Scrape one unit: a PDF document.

    Reads every text line of every page, concatenates them into one
    body (pages separated by blank lines), and yields an Article with
    the metadata configured in self.options.
    """
    parser = PDFParser()
    doc = parser.load_document(_file, self.options['pdf_password'])
    # Build the full text with str.join instead of `+=` in a loop,
    # which is quadratic in the number of pages/lines.
    page_texts = [
        "".join(line.get_text() + "\n" for line in parser.get_textlines(page))
        for page in parser.process_document(doc)
    ]
    res = "".join(txt + "\n\n" for txt in page_texts)
    article = Article(text=res)
    article.headline = self.getheadline(_file)
    article.medium = self.options['medium']
    article.section = self.options['section']
    # Use the configured date when provided, today's date otherwise.
    article.date = self.options['date'] or date.today()
    yield article
def _scrape_unit(self, unit):
    """Scrape a PDF whose leading page(s) form an index of articles.

    Index pages list "<headline> ... (<medium>) ... <page number>"
    entries; these are collected into self.index as (headline, medium)
    tuples.  On content pages, a line whose normalized text matches an
    indexed headline starts a new article; each finished article is
    built via self.getarticle(headline, lines) and yielded.
    """
    parser = PDFParser()
    self.index = []
    article_lines = []
    headline = ""
    doc = parser.load_document(self.options['file'])
    # Hoisted out of the loops: the original recompiled these per page
    # (and per line).  Raw strings fix the invalid `\(` escape
    # sequences; the resulting pattern text is byte-identical.
    index_pattern = re.compile(r"^[^\(]+\([^\)]+\)..+[0-9]+$")
    entry_pattern = re.compile(r"([^\(]+)(\([0-9]+\))? \(([^\)]+)\).+")
    for page in parser.process_document(doc):
        lines = list(parser.get_textlines(page))
        # Is this page an index page?
        if any(index_pattern.match(line.get_text()) for line in lines):
            for line in lines:
                result = entry_pattern.search(line.get_text())
                if result:
                    # group(1) = headline, group(3) = medium
                    self.index.append((result.group(1), result.group(3)))
            continue
        # Normalize the known titles once per page instead of once per
        # line (self.index only changes on index pages, which `continue`).
        known_titles = {h.lower().strip() for h, _ in self.index}
        # If not an index page, scrape lines for the current article.
        for line in lines:
            text = line.get_text()
            if text.lower().strip() in known_titles:
                # Title recognized: yield the old article, start a new one.
                if len(headline) > 0:
                    yield self.getarticle(headline, article_lines)
                headline = text
                article_lines = []
            article_lines.append(text)
    # Last article.
    yield self.getarticle(headline, article_lines)
def _scrape_unit(self, unit):
    """Scrape an index-fronted PDF into a stream of articles.

    First pass over each page decides whether it is an index page
    (headline/medium/page-number entries); index entries are appended
    to self.index.  Non-index pages are scanned line by line: a line
    matching an indexed headline closes the current article (yielded
    through self.getarticle) and opens the next one.
    """
    parser = PDFParser()
    self.index = []
    article_lines = []
    headline = ""
    doc = parser.load_document(self.options['file'])
    # Compile once, outside all loops (the original compiled inside the
    # per-page / per-line loops).  Raw strings avoid the deprecated
    # `\(` escapes without changing the pattern text.
    index_pattern = re.compile(r"^[^\(]+\([^\)]+\)..+[0-9]+$")
    entry_pattern = re.compile(r"([^\(]+)(\([0-9]+\))? \(([^\)]+)\).+")
    for page in parser.process_document(doc):
        page_lines = [line.get_text() for line in parser.get_textlines(page)]
        if any(index_pattern.match(text) for text in page_lines):
            # Index page: harvest (headline, medium) pairs and move on.
            for text in page_lines:
                match = entry_pattern.search(text)
                if match:
                    self.index.append((match.group(1), match.group(3)))
            continue
        # Build the normalized-title lookup once per page rather than
        # once per line (was accidental O(n^2)).
        titles = {entry[0].lower().strip() for entry in self.index}
        for text in page_lines:
            if text.lower().strip() in titles:
                # Recognized title: flush the previous article, if any.
                if len(headline) > 0:
                    yield self.getarticle(headline, article_lines)
                headline = text
                article_lines = []
            article_lines.append(text)
    # Flush the final article.
    yield self.getarticle(headline, article_lines)