예제 #1
0
파일: raw_pdf.py 프로젝트: BBie/amcat
 def _scrape_unit(self, _file):
     """Scrape one PDF document and yield it as a single Article.

     Every text line of every page is extracted (one "\n" per line,
     "\n\n" between pages) and the concatenation becomes the article
     text; headline and metadata come from the scraper options.

     @param _file: file-like object with the PDF to parse
     @yields: exactly one Article holding the full document text
     """
     parser = PDFParser()
     doc = parser.load_document(_file, self.options['pdf_password'])
     # Build the text with str.join instead of repeated `+=`, which is
     # quadratic on long documents.
     res = "".join(
         "".join(line.get_text() + "\n" for line in parser.get_textlines(page))
         + "\n\n"
         for page in parser.process_document(doc)
     )
     article = Article(text=res)
     article.headline = self.getheadline(_file)
     article.medium = self.options['medium']
     article.section = self.options['section']
     # Fall back to today's date when no explicit date option was given.
     article.date = self.options['date'] or date.today()
     yield article
예제 #2
0
파일: raw_pdf.py 프로젝트: isususi/amcat
 def _scrape_unit(self, _file):
     """Scrape one PDF document and yield it as a single Article.

     Collects every page's text lines ("\n" after each line, "\n\n"
     after each page) into one string used as the article body.

     @param _file: file-like object with the PDF to parse
     @yields: exactly one Article holding the full document text
     """
     parser = PDFParser()
     doc = parser.load_document(_file, self.options['pdf_password'])
     # Accumulate fragments and join once: avoids the quadratic cost of
     # `res += ...` while producing byte-identical output.
     chunks = []
     for page in parser.process_document(doc):
         for line in parser.get_textlines(page):
             chunks.append(line.get_text())
             chunks.append("\n")
         chunks.append("\n\n")
     article = Article(text="".join(chunks))
     article.headline = self.getheadline(_file)
     article.medium = self.options['medium']
     article.section = self.options['section']
     # Fall back to today's date when no explicit date option was given.
     article.date = self.options['date'] or date.today()
     yield article
예제 #3
0
    def _scrape_unit(self, unit):
        """Scrape articles from a PDF bulletin with index pages.

        Pages whose lines match the index pattern populate self.index
        with (headline, medium) pairs. On content pages, a line whose
        lowercased/stripped text matches a known headline starts a new
        article; accumulated lines are turned into articles via
        self.getarticle.

        @param unit: unused; the PDF path comes from self.options['file']
        @yields: Article objects built by self.getarticle
        """
        parser = PDFParser()
        self.index = []
        article_lines = []
        headline = ""
        doc = parser.load_document(self.options['file'])
        # Hoist the regexes out of the loops (they were recompiled per
        # line); raw strings avoid invalid-escape-sequence warnings.
        index_page_pattern = re.compile(r"^[^\(]+\([^\)]+\)..+[0-9]+$")
        entry_pattern = re.compile(r"([^\(]+)(\([0-9]+\))? \(([^\)]+)\).+")
        for page in parser.process_document(doc):
            texts = [line.get_text() for line in parser.get_textlines(page)]
            # Is this page an index page?
            if any(index_page_pattern.match(text) for text in texts):
                for text in texts:
                    result = entry_pattern.search(text)
                    if result:
                        self.index.append((result.group(1), result.group(3)))
                continue

            # Content page: compare against the known headlines once per
            # page instead of rebuilding the list for every line.
            known_titles = {h.lower().strip() for h, _ in self.index}
            for text in texts:
                if text.lower().strip() in known_titles:
                    # Title recognized: yield the previous article and
                    # start collecting a new one.
                    if headline:
                        yield self.getarticle(headline, article_lines)
                    headline = text
                    article_lines = []
                article_lines.append(text)

        # BUGFIX: this yield was indented inside the page loop, emitting
        # a partial article after every content page. The "last article"
        # must be yielded once, after all pages are processed.
        yield self.getarticle(headline, article_lines)
    def _scrape_unit(self, unit):
        """Scrape articles from a PDF bulletin with index pages.

        Index pages fill self.index with (headline, medium) pairs; on
        content pages a line matching a known headline closes the
        current article (yielded via self.getarticle) and opens a new
        one.

        @param unit: unused; the PDF path comes from self.options['file']
        @yields: Article objects built by self.getarticle
        """
        parser = PDFParser()
        self.index = []
        article_lines = []
        headline = ""
        doc = parser.load_document(self.options['file'])
        # Compile once, outside the loops; raw strings silence
        # invalid-escape-sequence warnings on modern Python.
        index_pattern = re.compile(r"^[^\(]+\([^\)]+\)..+[0-9]+$")
        entry_pattern = re.compile(r"([^\(]+)(\([0-9]+\))? \(([^\)]+)\).+")
        for page in parser.process_document(doc):
            # Is this page an index page?
            if any(index_pattern.match(line.get_text())
                   for line in parser.get_textlines(page)):
                for line in parser.get_textlines(page):
                    result = entry_pattern.search(line.get_text())
                    if result:
                        self.index.append((result.group(1), result.group(3)))
                continue

            # Set membership is O(1) per line vs. the original list
            # comprehension rebuilt for every single line.
            known = {h.lower().strip() for h, _ in self.index}
            for line in parser.get_textlines(page):
                text = line.get_text()
                if text.lower().strip() in known:
                    # Title recognized: yield old article, start new.
                    if headline:
                        yield self.getarticle(headline, article_lines)
                    headline = text
                    article_lines = []
                article_lines.append(text)

        # BUGFIX: the original yield sat inside the page loop, emitting a
        # partial article after every content page; yield the last
        # article exactly once, after the loop.
        yield self.getarticle(headline, article_lines)