def test_inspect_wikipedia_text(self):
    """Inspector finds the wikipedia text files, with both a narrow and a broad pattern.

    Checks that the configured data classes are reported back, and that the
    set of matched files equals the expected fixture list.
    """
    inspector = Inspector('tests/src/source_files',
                          self._wikipedia_text_data_classes,
                          match_pattern='*.txt')
    self.assertEqual(self._wikipedia_text_data_classes,
                     inspector.get_data_classes())
    # BUG FIX: list.sort() sorts in place and returns None, so the original
    # assertEquals(a.sort(), b.sort()) compared None == None and always
    # passed. sorted() returns a new list, making the comparison meaningful.
    self.assertEqual(
        sorted(self._wikipedia_text_files),
        sorted(inspector.get_match_files(WikipediaTextFileContentMatch)))
    # With a match pattern covering all files it still works (just slower).
    inspector = Inspector('tests/src/source_files',
                          self._wikipedia_text_data_classes,
                          match_pattern='*')
    self.assertEqual(
        sorted(self._wikipedia_text_files),
        sorted(inspector.get_match_files(WikipediaTextFileContentMatch)))
def test_inspect_britannica_html(self):
    """Inspector finds the britannica HTML files, with both a narrow and a broad pattern.

    Checks that the configured data classes are reported back, and that the
    set of matched files equals the expected fixture list.
    """
    inspector = Inspector('tests/src/source_files',
                          self._britannica_html_data_classes,
                          match_pattern='*.html')
    self.assertEqual(self._britannica_html_data_classes,
                     inspector.get_data_classes())
    # BUG FIX: list.sort() sorts in place and returns None, so the original
    # assertEquals(a.sort(), b.sort()) compared None == None and always
    # passed. sorted() returns a new list, making the comparison meaningful.
    self.assertEqual(
        sorted(self._britannica_html_files),
        sorted(inspector.get_match_files(BritannicaHTMLFileContentMatch)))
    # With a match pattern covering all files it still works (just slower).
    inspector = Inspector('tests/src/source_files',
                          self._britannica_html_data_classes,
                          match_pattern='*')
    self.assertEqual(
        sorted(self._britannica_html_files),
        sorted(inspector.get_match_files(BritannicaHTMLFileContentMatch)))
def _get_inspector(self, data_classes, match_pattern, source='tests/src/source_files'):
    """Build an Inspector over *source* for the given data classes and glob pattern."""
    inspector = Inspector(
        source=source,
        data_classes=data_classes,
        match_pattern=match_pattern,
    )
    return inspector
def test_errors_report(self):
    """Extraction over a fixture directory with a broken file reports one error.

    The fixture contains an HTML file with an invalid UTF-8 byte; the
    extractor should record a single error tuple of
    (path, stage, message) instead of crashing.
    """
    inspector = Inspector(source='tests/src/source_files_errors',
                          data_classes=self._wikipedia_html_data_classes,
                          match_pattern='*.html')
    extractor = Extractor(inspectors=[inspector])
    data_collection = extractor.extract()
    errors = data_collection.get_errors()
    # assertEquals is a deprecated alias of assertEqual; use the modern name.
    self.assertEqual(1, len(errors))
    self.assertEqual(
        ('tests/src/source_files_errors/error_unicode.html',
         'match',
         "'utf-8' codec can't decode byte 0xe0 in position 6071: invalid "
         'continuation byte'),
        errors[0].get_as_tuple())
def test_inspect_wikipedia_text(self):
    """Inspector finds the wikipedia text files, with both a narrow and a broad pattern.

    Checks that the configured data classes are reported back, and that the
    set of matched files equals the expected fixture list.
    """
    inspector = Inspector('tests/src/source_files',
                          self._wikipedia_text_data_classes,
                          match_pattern='*.txt')
    self.assertEqual(self._wikipedia_text_data_classes,
                     inspector.get_data_classes())
    # BUG FIX: list.sort() sorts in place and returns None, so the original
    # assertEquals(a.sort(), b.sort()) compared None == None and always
    # passed. sorted() returns a new list, making the comparison meaningful.
    self.assertEqual(
        sorted(self._wikipedia_text_files),
        sorted(inspector.get_match_files(WikipediaTextFileContentMatch)))
    # With a match pattern covering all files it still works (just slower).
    inspector = Inspector('tests/src/source_files',
                          self._wikipedia_text_data_classes,
                          match_pattern='*')
    self.assertEqual(
        sorted(self._wikipedia_text_files),
        sorted(inspector.get_match_files(WikipediaTextFileContentMatch)))
def test_inspect_britannica_html(self):
    """Inspector finds the britannica HTML files, with both a narrow and a broad pattern.

    Checks that the configured data classes are reported back, and that the
    set of matched files equals the expected fixture list.
    """
    inspector = Inspector('tests/src/source_files',
                          self._britannica_html_data_classes,
                          match_pattern='*.html')
    self.assertEqual(self._britannica_html_data_classes,
                     inspector.get_data_classes())
    # BUG FIX: list.sort() sorts in place and returns None, so the original
    # assertEquals(a.sort(), b.sort()) compared None == None and always
    # passed. sorted() returns a new list, making the comparison meaningful.
    self.assertEqual(
        sorted(self._britannica_html_files),
        sorted(inspector.get_match_files(BritannicaHTMLFileContentMatch)))
    # With a match pattern covering all files it still works (just slower).
    inspector = Inspector('tests/src/source_files',
                          self._britannica_html_data_classes,
                          match_pattern='*')
    self.assertEqual(
        sorted(self._britannica_html_files),
        sorted(inspector.get_match_files(BritannicaHTMLFileContentMatch)))
# Root of the raw HTML dump of the "La Fraise" blog to analyse.
source_directory = 'sandbox/dalz/Raw_Field_Blog/HTLML_complete/Blog_LaFraise/Blog_LaFraise/blog.lafraise.com/fr/'

# The different pieces of data that will be extracted from each article page.
data_classes = [
    ArticleCommentCountFileData,
    AuthorArticleCountFilesData,
    ArticlePublicationDateFileData,
    ArticlePublicationHourFileData,
    ArticleAuthorFileData,
    ArticleWordCountFileData,
    CommentAuthorCommentCountFilesDatas,
    AuthorArticlesCommentsCountAverageFilesData,
    AuthorArticlesWordsCountAverageFilesData,
    ArticlePatriceCommentCountFileData,
]

# Inspector in charge of collecting the files matching the requested data.
lafraise_inspector = Inspector(
    source=source_directory,
    data_classes=data_classes,
    match_pattern='*.html',
)

# Run the extraction over everything the inspector found.
data_extractor = Extractor(inspectors=[lafraise_inspector])
extracted_collection = data_extractor.extract()

# CSV exporter; implode_classes lists the data to compile into one file each.
exporter = CSVExporter(extracted_collection,
                       implode_classes=[ArticleImplode, AuthorImplode])

# Write the extracted data sets into the output directory.
exporter.export('sandbox/dalz/output')