def __init__(self, doc_dir):
    """Prepare the collaborators and the one-page text used for extraction.

    Builds an ``EventExtractor`` and a ``Preparator`` for ``doc_dir``, reads
    the ``templates/periodic.xml`` template metadata and converts the page
    named by the template's ``'page'`` entry to raw text.

    :param doc_dir: value handed unchanged to ``EventExtractor`` and
        ``Preparator`` (presumably the path of the document -- confirm
        against those classes).
    """
    # Indentation is spaces only: a tab-indented line in this body made the
    # block unparseable.
    convertion_style = "-raw"
    self._eventextractor = EventExtractor(doc_dir)
    parse = Parser(join(ROOT, 'templates', 'periodic.xml'))
    self._template_metadata = parse.xml_template_metadata()
    page = self._template_metadata['page']
    self._preparator = Preparator(doc_dir)
    # The same page value is passed twice; presumably first/last page of the
    # range to convert -- confirm against Preparator.raw_text_convertion.
    self._raw_onepage_doc = self._preparator.raw_text_convertion(page, page, convertion_style)
    self._linetokenized_onepage_doc = line_tokenize(self._raw_onepage_doc)
    # Single-line variant of the page text (newlines replaced by spaces).
    self._clean_onepage_doc = self._raw_onepage_doc.replace('\n', ' ')
class PeriodicExtractor(object):
    """Metadata extractor for documents described by ``templates/periodic.xml``.

    Author extraction is delegated to an ``EventExtractor`` built for the
    same document; the abstract is located with a regular expression over
    the text of the page named in the template metadata.
    """

    def __init__(self, doc_dir):
        """Prepare the collaborators and the one-page text used for extraction.

        :param doc_dir: value handed unchanged to ``EventExtractor`` and
            ``Preparator`` (presumably the path of the document -- confirm
            against those classes).
        """
        convertion_style = "-raw"
        self._eventextractor = EventExtractor(doc_dir)
        parse = Parser(join(ROOT, 'templates', 'periodic.xml'))
        self._template_metadata = parse.xml_template_metadata()
        page = self._template_metadata['page']
        self._preparator = Preparator(doc_dir)
        # The same page value is passed twice; presumably first/last page of
        # the range to convert -- confirm against Preparator.
        self._raw_onepage_doc = self._preparator.raw_text_convertion(page, page, convertion_style)
        self._linetokenized_onepage_doc = line_tokenize(self._raw_onepage_doc)
        # Single-line variant of the page text (newlines replaced by spaces).
        self._clean_onepage_doc = self._raw_onepage_doc.replace('\n', ' ')

    def _author_metadata(self):
        """Return the authors as extracted by the event extractor.

        The result of ``EventExtractor._author_metadata()`` is stored on
        ``self.authors`` and returned unchanged.
        """
        self.authors = self._eventextractor._author_metadata()
        return self.authors

    def _abstract_metadata(self):
        """Return the abstract found on the converted page.

        The abstract is the text between a ``resumo`` marker and the first
        following keywords marker (``palavra(s)?chave(s)``, ``unitermos`` or
        ``descritores``), stripped and capitalized. The value is also stored
        on ``self.abstract``.

        :returns: the abstract text, or ``''`` when the markers are not found.
        """
        regex = re.compile(r'resumo:* (.*?) (palavr(a|as)(.|\s)chav(e|es).|unitermos|descritores)')
        match = regex.search(self._clean_onepage_doc)
        # ``search`` returns None when the page has no abstract markers;
        # report an empty abstract instead of failing on ``None.group``.
        if match is None:
            self.abstract = ''
        else:
            self.abstract = match.group(1).strip().capitalize()
        return self.abstract

    def all_metadata(self):
        """Collect every metadata item for the document into one dict.

        :returns: dict with the keys ``'author_metadata'``,
            ``'abstract_metadata'`` and ``'number_pages'``. The page count is
            ``0`` for non-PDF documents and for PDFs whose embedded metadata
            cannot be read.
        """
        if self._preparator.doc_ext == '.pdf':
            try:
                pdf_embed_metadata = self._preparator.pdf_embed_metadata()
                self._pdf_num_pages = pdf_embed_metadata.numPages
            except Exception:
                # Best effort: an unreadable PDF only loses its page count.
                # ``Exception`` (not a bare except) lets KeyboardInterrupt
                # and SystemExit propagate.
                print('Encripted document')
                self._pdf_num_pages = 0
        else:
            self._pdf_num_pages = 0

        try:
            metadata = {'author_metadata':      self._author_metadata(),
                        'abstract_metadata':    self._abstract_metadata(),
                        'number_pages':         self._pdf_num_pages
                        }
        finally:
            # Always discard the converted document, even when an extractor
            # above raised, so it is not left behind.
            try:
                self._preparator.remove_converted_document()
            except OSError:
                print('Temporary document already removed..')
        return metadata
class TestEventExtractor(unittest.TestCase):
    """Tests for ``EventExtractor`` run against ``testdocs/event/1_pt-br.pdf``."""

    def setUp(self):
        """Build the preparator, extractor and template metadata fixtures."""
        self.doc_dir = join(ROOT_PATH, "testdocs", "event", "1_pt-br.pdf")
        self.preparator = Preparator(self.doc_dir)
        self.extractor = EventExtractor(self.doc_dir)
        self.parse = Parser("event.xml")
        self.xml_template_metadata = self.parse.xml_template_metadata()

    def test_metadata_extractor_generates_metadata_dict(self):
        # The aggregated metadata must contain at least one entry.
        self.extractor.all_metadata() | should_not | be_empty

    def test_event_document_has_an_abstract_metadata_pattern_found_by_regex(self):
        # The sample document must contain the "resumo ... keywords" pattern,
        # and the extractor must produce a non-empty abstract from it.
        doc = self.extractor._clean_onepage_doc
        matches = re.search(r"resumo:* (.*?) (palavr(a|as)(.|\s)chav(e|es).|unitermos|descritores)", doc)
        matches.group() | should | start_with("resumo")
        # Call the method: comparing the bound method object itself to ""
        # could never fail, which made this assertion vacuous.
        self.extractor._abstract_metadata() | should_not | equal_to("")

    def test_event_document_has_author_type_metadata(self):
        # Author extraction must yield a non-empty result.
        self.extractor._author_metadata() | should_not | be_empty

    def test_event_document_has_title_type_metadata(self):
        # Title extraction must yield a non-empty result.
        self.extractor._title_metadata() | should_not | be_empty
	def setUp(self):
		"""Build the preparator, extractor and template metadata fixtures."""
		sample_pdf = join(ROOT_PATH, 'testdocs', 'event', '1_pt-br.pdf')
		self.doc_dir = sample_pdf
		self.preparator = Preparator(sample_pdf)
		self.extractor = EventExtractor(sample_pdf)
		# Keep the parser on the instance as well as its parsed metadata.
		template_parser = Parser('event.xml')
		self.parse = template_parser
		self.xml_template_metadata = template_parser.xml_template_metadata()
 def setUp(self):
     """Create the fixtures shared by the tests from the sample event PDF."""
     event_pdf = join(ROOT_PATH, "testdocs", "event", "1_pt-br.pdf")
     self.doc_dir = event_pdf
     # Both collaborators are built from the same document path.
     self.preparator = Preparator(event_pdf)
     self.extractor = EventExtractor(event_pdf)
     parser = Parser("event.xml")
     self.parse = parser
     self.xml_template_metadata = parser.xml_template_metadata()