Ejemplo n.º 1
0
    def run(self):
        """Import the NLTK-backed corpus and persist documents, classes and meta data.

        Appends the ``nltk`` folder below the configured raw-data path to
        ``nltk_path``, determines the valid classes, writes all documents
        through a document writer and finally stores the class list.

        Raises:
            ImporterError: If a ``LookupError`` or ``FileNotFoundError`` is
                raised while loading the classes or documents or while saving
                the meta information.
        """
        # Set the NLTK path (http://www.nltk.org/_modules/nltk/data.html)
        nltk_path.append(join(config.paths["rawdata"], "nltk"))

        try:
            # Check which classes are valid depending on min_docs_per_class
            nbprint('Loading classes')
            self.load_valid_classes()

            # Load the documents
            with data.document_writer(self.info) as document_writer:
                # Initialize info classes
                self.classinfo = ClassInfo()
                self.docinfo = DocumentInfo(document_writer)

                # Load documents and store class information in classinfo
                self.load_documents()

            # Print Meta Information
            self.docinfo.save_meta(self.info)
            self.classinfo.save_meta(self.info)

        except (LookupError, FileNotFoundError) as err:
            # The importer's info lives on ``self``; a bare ``info`` is not
            # bound in this method and would turn the intended ImporterError
            # into a NameError.
            raise ImporterError(
                self.info,
                'Directory "{}" does not contain the required corpus.'.format(
                    nltk_path)) from err

        # Save the classes
        classes = self.classinfo.make_class_list()
        data.save_classes(classes, self.info)
Ejemplo n.º 2
0
    def run(self):
        """Import every ``txt`` file of the ``atd`` raw-data folder as one document.

        Each file is read as UTF-8 text and handed to a ``DocumentInfo``
        instance; the document meta information is saved afterwards.

        Raises:
            ImporterError: If the folder does not exist or contains no file
                whose last dot-separated name part is ``txt``.
        """
        folder = join(config.paths["rawdata"], "atd")

        # List txt files
        try:
            files = listdir(folder)
        except FileNotFoundError as err:
            # Use self.info: no bare ``info`` name is bound in this method.
            raise ImporterError(
                self.info,
                'Directory "{}" does not exist'.format(folder)) from err

        # Keep only .txt files
        files = [name for name in files if name.split(".")[-1] == "txt"]

        # Check if files exist
        if not files:
            raise ImporterError(self.info,
                                'There are no valid files in the folder.')

        # Add files one by one
        with data.document_writer(self.info) as document_writer:
            docinfo = DocumentInfo(document_writer)
            # ``files`` was filtered above, so no per-file extension check
            # is repeated inside the loop.
            for filename in ProgressIterator(files, doc_progress_label):
                with open(join(folder, filename), "r",
                          encoding="utf8") as file:
                    text = file.read()
                # The file handle is released before the document is stored.
                docinfo.add_document(text)
            # Print Meta Information
            docinfo.save_meta(self.info)
Ejemplo n.º 3
0
 def run(self):
     """Import the ACM abstract archive and save the document meta information.

     Opens a document writer for ``self.info``, points ``self.archivepath``
     at ``acm/abstract.zip`` below the configured raw-data path and delegates
     the actual import to ``self.import_archive()``.
     """
     with data.document_writer(self.info) as writer:
         # Collector that receives the imported documents
         self.docinfo = DocumentInfo(writer)

         # Locate the archive and import its content
         rawdata_root = config.paths['rawdata']
         self.archivepath = join(rawdata_root, 'acm/abstract.zip')
         self.import_archive()

         # Store the meta information while the writer is still open
         self.docinfo.save_meta(self.info)
Ejemplo n.º 4
0
    def save_documents(self):
        """Write all entries of ``self.documents`` through a document writer.

        For every document the ``'subject'`` entry of its ``content`` is used
        as text and the ``'maincat'`` entry is counted in a fresh
        ``ClassInfo``; the id returned by that count is stored with the text.
        """
        nbprint('Saving documents')

        self.classinfo = ClassInfo()

        with data.document_writer(self.info) as writer:
            self.docinfo = DocumentInfo(writer)
            for document in self.documents:
                subject = document.content['subject']
                main_category = document.content['maincat']
                category_id = self.classinfo.increase_class_count(
                    main_category)
                self.docinfo.add_document(subject, category_id)
Ejemplo n.º 5
0
    def run(self):
        """Import both ODP tweet data sets and persist classes and meta data.

        Reads the optional ``max_docs_per_cls`` setting from
        ``self.info['data_info']`` (``None`` when absent), feeds the two data
        sets to ``self.add_data`` while a document writer is open, then saves
        the class list followed by the document and class meta information.
        """
        self.classinfo = ClassInfo()
        data_info = self.info['data_info']
        self.max_docs_per_cls = data_info.get('max_docs_per_cls', None)

        with data.document_writer(self.info) as writer:
            self.docinfo = DocumentInfo(writer)
            # Data sets are imported in this fixed order
            for dataset_name in ("ODPtweets-Mar17-29", "ODPtweets-Apr12-24"):
                self.add_data(dataset_name)

        # Persist the class list
        class_list = self.classinfo.make_class_list()
        data.save_classes(class_list, self.info)

        # Persist the meta information
        self.docinfo.save_meta(self.info)
        self.classinfo.save_meta(self.info)
Ejemplo n.º 6
0
    def run(self):
        """Import every archive of the ``tweetsla`` raw-data folder.

        The archives returned by ``self.get_archives`` are processed in
        order: a progress line ``"<position>/<total>: <archive>"`` is printed
        and pushed, ``self.archivepath`` is set, ``self.import_archive()`` is
        called and the print level is popped again. The document meta
        information is saved while the writer is still open.
        """
        with data.document_writer(self.info) as writer:
            self.docinfo = DocumentInfo(writer)

            # Walk through all archives, counting from 1 for the progress line
            folder = join(config.paths["rawdata"], "tweetsla")
            archives = self.get_archives(folder)
            for position, archive in enumerate(archives, start=1):
                progress = '{}/{}: {}'.format(position, len(archives), archive)
                nbprint(progress).push()
                self.archivepath = join(folder, archive)
                self.import_archive()
                nbprint.pop()

            # Store the meta information
            self.docinfo.save_meta(self.info)
Ejemplo n.º 7
0
    def run(self):
        """Import the consumer complaints CSV in two passes over the same file.

        The first pass hands the opened file to ``self.load_classes``; the
        second pass, performed while a document writer is open, hands it to
        ``self.load_data``. Afterwards the class list and the document and
        class meta information are saved.
        """
        self.classinfo = ClassInfo()
        csv_path = join(config.paths["rawdata"],
                        "complaints/consumer_complaints.csv")

        # First pass: class information
        with open(csv_path) as handle:
            self.load_classes(handle)

        # Second pass: the documents themselves
        with data.document_writer(self.info) as writer:
            self.docinfo = DocumentInfo(writer)
            with open(csv_path) as handle:
                self.load_data(handle)

        # Persist the class list
        class_list = self.classinfo.make_class_list()
        data.save_classes(class_list, self.info)

        # Persist the meta information
        self.docinfo.save_meta(self.info)
        self.classinfo.save_meta(self.info)
Ejemplo n.º 8
0
    def run(self):
        """Import the classic4 corpus files and persist documents and classes.

        Lists the ``classic4`` raw-data folder, keeps only files whose name
        contains a dot and starts with one of the known class names, loads
        the documents through a document writer and saves the meta
        information and the class list.

        Raises:
            ImporterError: If the folder does not exist or contains no file
                matching one of the class names.
        """
        # Get all files in the classic4 directory
        self.folder = join(config.paths["rawdata"], "classic4")
        try:
            self.files = listdir(self.folder)
        except FileNotFoundError as err:
            # Use self.info: no bare ``info`` name is bound in this method.
            raise ImporterError(
                self.info,
                'Directory "{}" does not exist'.format(self.folder)) from err

        # Remove .gitignore file from list
        self.files = [file for file in self.files if file != '.gitignore']

        # Keep only files that start with a classname
        self.classnames = ['cacm', 'cisi', 'cran', 'med']
        self.files = [
            file for file in self.files
            if '.' in file and file.split('.')[0] in self.classnames
        ]

        # Check if files exist
        if not self.files:
            raise ImporterError(self.info,
                                'There are no valid files in the folder.')

        with data.document_writer(self.info) as document_writer:
            # Initialize info classes
            self.classinfo = ClassInfo()
            self.docinfo = DocumentInfo(document_writer)

            # Load documents and store class information in classinfo
            self.load_documents()

        # Print Meta Information
        self.docinfo.save_meta(self.info)
        self.classinfo.save_meta(self.info)

        # Save classinfo
        classes = self.classinfo.make_class_list()
        data.save_classes(classes, self.info)
Ejemplo n.º 9
0
    def run(self):
        """Fetch the 20 newsgroups data via sklearn and persist it.

        The raw data is fetched for the ``'all'`` subset into the ``sklearn``
        folder below the configured raw-data path, with the parts listed in
        ``self.info['data_info']['remove']`` removed. The documents are then
        loaded through a document writer, after which the meta information
        and the class list are saved.
        """
        nbprint('Loading raw files')
        sklearn_home = join(config.paths['rawdata'], 'sklearn')
        removed_parts = tuple(self.info['data_info']['remove'])
        self.rawdata = fetch_20newsgroups(
            data_home=sklearn_home,
            remove=removed_parts,
            subset='all')

        with data.document_writer(self.info) as writer:
            # Fresh collectors for class and document information
            self.classinfo = ClassInfo()
            self.docinfo = DocumentInfo(writer)

            # Fill both collectors from the fetched raw data
            self.load_documents()

        # Persist the meta information
        self.docinfo.save_meta(self.info)
        self.classinfo.save_meta(self.info)

        # Persist the class list
        class_list = self.classinfo.make_class_list()
        data.save_classes(class_list, self.info)