Example #1
0
    def test_inspect_wikipedia_text(self):
        """Check that the Inspector finds the expected Wikipedia text files.

        The inspector is built twice over 'tests/src/source_files': once
        restricted to '*.txt' and once with the catch-all '*' pattern. Both
        must report the same matching files for
        WikipediaTextFileContentMatch.
        """
        inspector = Inspector('tests/src/source_files',
                              self._wikipedia_text_data_classes,
                              match_pattern='*.txt')
        self.assertEqual(self._wikipedia_text_data_classes,
                         inspector.get_data_classes())

        # Compare sorted copies: list.sort() sorts in place and returns None,
        # so comparing the results of .sort() would always pass (None == None).
        # NOTE(review): assumes get_match_files returns an iterable of
        # mutually comparable items (e.g. path strings) -- confirm.
        self.assertEqual(
            sorted(self._wikipedia_text_files),
            sorted(inspector.get_match_files(WikipediaTextFileContentMatch)))

        # With a match pattern covering all files it works too (but slower).
        inspector = Inspector('tests/src/source_files',
                              self._wikipedia_text_data_classes,
                              match_pattern='*')
        self.assertEqual(
            sorted(self._wikipedia_text_files),
            sorted(inspector.get_match_files(WikipediaTextFileContentMatch)))
Example #2
0
    def test_inspect_britannica_html(self):
        """Check that the Inspector finds the expected Britannica HTML files.

        The inspector is built twice over 'tests/src/source_files': once
        restricted to '*.html' and once with the catch-all '*' pattern. Both
        must report the same matching files for
        BritannicaHTMLFileContentMatch.
        """
        inspector = Inspector('tests/src/source_files',
                              self._britannica_html_data_classes,
                              match_pattern='*.html')
        self.assertEqual(self._britannica_html_data_classes,
                         inspector.get_data_classes())

        # Compare sorted copies: list.sort() sorts in place and returns None,
        # so comparing the results of .sort() would always pass (None == None).
        # NOTE(review): assumes get_match_files returns an iterable of
        # mutually comparable items (e.g. path strings) -- confirm.
        self.assertEqual(
            sorted(self._britannica_html_files),
            sorted(inspector.get_match_files(BritannicaHTMLFileContentMatch)))

        # With a match pattern covering all files it works too (but slower).
        inspector = Inspector('tests/src/source_files',
                              self._britannica_html_data_classes,
                              match_pattern='*')
        self.assertEqual(
            sorted(self._britannica_html_files),
            sorted(inspector.get_match_files(BritannicaHTMLFileContentMatch)))
Example #3
0
 def _get_inspector(self, data_classes, match_pattern,
                    source='tests/src/source_files'):
     """Build an Inspector for the given data classes and match pattern.

     :param data_classes: forwarded to Inspector as ``data_classes``
     :param match_pattern: forwarded to Inspector as ``match_pattern``
     :param source: forwarded to Inspector as ``source``; defaults to the
         test fixture directory 'tests/src/source_files'
     :return: the newly created Inspector
     """
     inspector_options = {
         'source': source,
         'data_classes': data_classes,
         'match_pattern': match_pattern,
     }
     return Inspector(**inspector_options)
Example #4
0
    def test_errors_report(self):
        """Check that extraction errors are collected and reported.

        Runs an Extractor over 'tests/src/source_files_errors' with the
        Wikipedia HTML data classes and expects exactly one collected error,
        whose tuple form holds the offending file path, the string 'match'
        and the UTF-8 decoding error message.
        """
        inspector = Inspector(source='tests/src/source_files_errors',
                              data_classes=self._wikipedia_html_data_classes,
                              match_pattern='*.html')
        extractor = Extractor(inspectors=[inspector])
        data_collection = extractor.extract()

        errors = data_collection.get_errors()
        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed from unittest in Python 3.12.
        self.assertEqual(1, len(errors))
        self.assertEqual(('tests/src/source_files_errors/error_unicode.html',
                          'match',
                          "'utf-8' codec can't decode byte 0xe0 in position 6071: invalid "
                          'continuation byte'), errors[0].get_as_tuple())
Example #5
0
    def test_inspect_wikipedia_text(self):
        """Check that the Inspector finds the expected Wikipedia text files.

        The inspector is built twice over 'tests/src/source_files': once
        restricted to '*.txt' and once with the catch-all '*' pattern. Both
        must report the same matching files for
        WikipediaTextFileContentMatch.
        """
        inspector = Inspector('tests/src/source_files',
                              self._wikipedia_text_data_classes,
                              match_pattern='*.txt')
        self.assertEqual(self._wikipedia_text_data_classes,
                         inspector.get_data_classes())

        # Compare sorted copies: list.sort() sorts in place and returns None,
        # so comparing the results of .sort() would always pass (None == None).
        # NOTE(review): assumes get_match_files returns an iterable of
        # mutually comparable items (e.g. path strings) -- confirm.
        self.assertEqual(
            sorted(self._wikipedia_text_files),
            sorted(inspector.get_match_files(WikipediaTextFileContentMatch)))

        # With a match pattern covering all files it works too (but slower).
        inspector = Inspector('tests/src/source_files',
                              self._wikipedia_text_data_classes,
                              match_pattern='*')
        self.assertEqual(
            sorted(self._wikipedia_text_files),
            sorted(inspector.get_match_files(WikipediaTextFileContentMatch)))
Example #6
0
    def test_inspect_britannica_html(self):
        """Check that the Inspector finds the expected Britannica HTML files.

        The inspector is built twice over 'tests/src/source_files': once
        restricted to '*.html' and once with the catch-all '*' pattern. Both
        must report the same matching files for
        BritannicaHTMLFileContentMatch.
        """
        inspector = Inspector('tests/src/source_files',
                              self._britannica_html_data_classes,
                              match_pattern='*.html')
        self.assertEqual(self._britannica_html_data_classes,
                         inspector.get_data_classes())

        # Compare sorted copies: list.sort() sorts in place and returns None,
        # so comparing the results of .sort() would always pass (None == None).
        # NOTE(review): assumes get_match_files returns an iterable of
        # mutually comparable items (e.g. path strings) -- confirm.
        self.assertEqual(
            sorted(self._britannica_html_files),
            sorted(inspector.get_match_files(BritannicaHTMLFileContentMatch)))

        # With a match pattern covering all files it works too (but slower).
        inspector = Inspector('tests/src/source_files',
                              self._britannica_html_data_classes,
                              match_pattern='*')
        self.assertEqual(
            sorted(self._britannica_html_files),
            sorted(inspector.get_match_files(BritannicaHTMLFileContentMatch)))
Example #7
0
# Root directory of the HTML files to inspect.
source_directory = 'sandbox/dalz/Raw_Field_Blog/HTLML_complete/Blog_LaFraise/Blog_LaFraise/blog.lafraise.com/fr/'

# The different pieces of data that are going to be extracted.
data_classes = [
    ArticleCommentCountFileData,
    AuthorArticleCountFilesData,
    ArticlePublicationDateFileData,
    ArticlePublicationHourFileData,
    ArticleAuthorFileData,
    ArticleWordCountFileData,
    CommentAuthorCommentCountFilesDatas,
    AuthorArticlesCommentsCountAverageFilesData,
    AuthorArticlesWordsCountAverageFilesData,
    ArticlePatriceCommentCountFileData,
]

# Create the object in charge of collecting the files that match the
# requested data.
inspector_lafraise = Inspector(
    source=source_directory,
    data_classes=data_classes,
    match_pattern='*.html',
)

# Create the data extractor.
extractor = Extractor(inspectors=[inspector_lafraise])

# Extract the data.
data_collection = extractor.extract()

# Create the CSV export object, specifying the list of data to compile
# into a file.
csv_convector = CSVExporter(
    data_collection,
    implode_classes=[ArticleImplode, AuthorImplode],
)

# Export the extracted data into the output directory.
csv_convector.export('sandbox/dalz/output')