def run(self):
    # Set the NLTK path (http://www.nltk.org/_modules/nltk/data.html)
    nltk_path.append(join(config.paths["rawdata"], "nltk"))

    try:
        # Check which classes are valid depending on min_docs_per_class
        nbprint('Loading classes')
        self.load_valid_classes()

        # Load the documents
        with data.document_writer(self.info) as document_writer:
            # Initialize info classes
            self.classinfo = ClassInfo()
            self.docinfo = DocumentInfo(document_writer)

            # Load documents and store class information in classinfo
            self.load_documents()

            # Print Meta Information
            self.docinfo.save_meta(self.info)
            self.classinfo.save_meta(self.info)
    except (LookupError, FileNotFoundError):
        raise ImporterError(
            self.info,
            'Directory "{}" does not contain the required corpus.'.format(nltk_path))

    # Save the classes
    classes = self.classinfo.make_class_list()
    data.save_classes(classes, self.info)
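# Background sketch (not part of the importer): NLTK resolves corpora by
# scanning the directories in nltk.data.path, which is what the
# nltk_path.append(...) call above extends, and it raises LookupError when a
# resource is missing; that is the error the try/except above converts into
# an ImporterError. A minimal, self-contained illustration follows. The
# resource name 'corpora/brown' is only an example, not necessarily the
# corpus this importer needs.

from os.path import join
from nltk.data import find, path as nltk_path

def corpus_available(rawdata_dir, resource='corpora/brown'):
    # Let NLTK search the raw-data directory as well
    nltk_path.append(join(rawdata_dir, 'nltk'))
    try:
        find(resource)  # raises LookupError if the resource is not found
        return True
    except LookupError:
        return False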
def run(self):
    folder = join(config.paths["rawdata"], "atd")

    # List the files in the folder
    try:
        files = listdir(folder)
    except FileNotFoundError:
        raise ImporterError(self.info, 'Directory "{}" does not exist'.format(folder))

    # Keep only .txt files
    files = [file for file in files if file.split(".")[-1] == "txt"]

    # Check if any files remain
    if len(files) == 0:
        raise ImporterError(self.info, 'There are no valid files in the folder.')

    # Add files one by one
    with data.document_writer(self.info) as document_writer:
        docinfo = DocumentInfo(document_writer)
        for filename in ProgressIterator(files, doc_progress_label):
            with open(join(folder, filename), "r", encoding="utf8") as file:
                text = file.read()
            docinfo.add_document(text)

        # Print Meta Information
        docinfo.save_meta(self.info)
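# DocumentInfo is defined elsewhere in the repository. To make the calls
# above easier to follow, here is a hypothetical minimal sketch of its
# apparent responsibilities: forwarding documents to the writer and
# tracking counts. Only the method names add_document/save_meta are taken
# from the usage above; the writer interface and the fields are assumptions.

class DocumentInfoSketch:
    def __init__(self, document_writer):
        self.writer = document_writer
        self.num_documents = 0

    def add_document(self, text, class_id=None):
        # Forward one document to the writer and keep a running count
        self.writer.write({'text': text, 'class_id': class_id})  # assumed interface
        self.num_documents += 1

    def save_meta(self, info):
        # Report simple corpus statistics
        print('{} documents imported'.format(self.num_documents))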
def run(self):
    # Open Writer
    with data.document_writer(self.info) as document_writer:
        self.docinfo = DocumentInfo(document_writer)

        # Open archive
        self.archivepath = join(config.paths['rawdata'], 'acm/abstract.zip')
        self.import_archive()

        # Print Meta Info
        self.docinfo.save_meta(self.info)
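# import_archive() is implemented elsewhere. Below is a sketch of how a zip
# archive of abstracts could be streamed into docinfo using only the
# standard library; the member filtering and the utf8 encoding are
# assumptions, only the zipfile calls themselves are standard.

from zipfile import ZipFile

def import_archive_sketch(archivepath, docinfo):
    with ZipFile(archivepath) as archive:
        for name in archive.namelist():
            if name.endswith('.txt'):  # assumed member layout
                text = archive.read(name).decode('utf8')
                docinfo.add_document(text)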
def save_documents(self):
    nbprint('Saving documents')
    self.classinfo = ClassInfo()

    # Open Writer
    with data.document_writer(self.info) as document_writer:
        self.docinfo = DocumentInfo(document_writer)
        for doc in self.documents:
            text = doc.content['subject']
            class_id = self.classinfo.increase_class_count(doc.content['maincat'])
            self.docinfo.add_document(text, class_id)
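# ClassInfo is likewise defined elsewhere. Judging from its use here and in
# the other importers (increase_class_count, make_class_list, save_meta),
# it maps class names to ids and counts documents per class. A hypothetical
# minimal version, all details assumed:

class ClassInfoSketch:
    def __init__(self):
        self.ids = {}     # class name -> class id
        self.counts = {}  # class name -> document count

    def increase_class_count(self, classname):
        # Assign an id on first sight, then count the document
        if classname not in self.ids:
            self.ids[classname] = len(self.ids)
        self.counts[classname] = self.counts.get(classname, 0) + 1
        return self.ids[classname]

    def make_class_list(self):
        # Class names ordered by their assigned id
        return sorted(self.ids, key=self.ids.get)

    def save_meta(self, info):
        print('{} classes'.format(len(self.ids)))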
def run(self):
    self.classinfo = ClassInfo()
    self.max_docs_per_cls = self.info['data_info'].get('max_docs_per_cls', None)

    with data.document_writer(self.info) as document_writer:
        self.docinfo = DocumentInfo(document_writer)
        self.add_data("ODPtweets-Mar17-29")
        self.add_data("ODPtweets-Apr12-24")

    # Save the classes
    classes = self.classinfo.make_class_list()
    data.save_classes(classes, self.info)

    # Print Meta Info
    self.docinfo.save_meta(self.info)
    self.classinfo.save_meta(self.info)
def run(self):
    # Open Writer
    with data.document_writer(self.info) as document_writer:
        self.docinfo = DocumentInfo(document_writer)

        # Iterate all archives
        folder = join(config.paths["rawdata"], "tweetsla")
        archives = self.get_archives(folder)
        for idx, archive in enumerate(archives):
            nbprint('{}/{}: {}'.format(idx + 1, len(archives), archive)).push()
            self.archivepath = join(folder, archive)
            self.import_archive()
            nbprint.pop()

        # Print Meta Info
        self.docinfo.save_meta(self.info)
def run(self):
    self.classinfo = ClassInfo()
    filename = join(config.paths["rawdata"], "complaints/consumer_complaints.csv")

    # First pass over the file: collect class information
    with open(filename) as file:
        self.load_classes(file)

    # Second pass: write the documents
    with data.document_writer(self.info) as document_writer:
        self.docinfo = DocumentInfo(document_writer)
        with open(filename) as file:
            self.load_data(file)

    # Save the classes
    classes = self.classinfo.make_class_list()
    data.save_classes(classes, self.info)

    # Print Meta Info
    self.docinfo.save_meta(self.info)
    self.classinfo.save_meta(self.info)
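# load_classes() and load_data() are implemented elsewhere; the file is
# opened twice, presumably so class information is complete before any
# document is written. A hypothetical sketch using the csv module; the
# column names 'Product' and 'Consumer complaint narrative' are assumptions
# about the CSV layout, and classinfo.ids refers to the ClassInfoSketch
# field above, not necessarily the real ClassInfo.

import csv

def load_classes_sketch(file, classinfo):
    # First pass: count documents per class
    for row in csv.DictReader(file):
        classinfo.increase_class_count(row['Product'])

def load_data_sketch(file, classinfo, docinfo):
    # Second pass: write each narrative together with its class id
    for row in csv.DictReader(file):
        class_id = classinfo.ids[row['Product']]
        docinfo.add_document(row['Consumer complaint narrative'], class_id)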
def run(self):
    # Get all files in the classic4 directory
    self.folder = join(config.paths["rawdata"], "classic4")
    try:
        self.files = listdir(self.folder)
    except FileNotFoundError:
        raise ImporterError(self.info, 'Directory "{}" does not exist'.format(self.folder))

    # Remove the .gitignore file from the list
    self.files = [file for file in self.files if file != '.gitignore']

    # Keep only files that start with a classname
    self.classnames = ['cacm', 'cisi', 'cran', 'med']
    self.files = [file for file in self.files
                  if '.' in file and file.split('.')[0] in self.classnames]

    # Check if any files remain
    if len(self.files) == 0:
        raise ImporterError(self.info, 'There are no valid files in the folder.')

    with data.document_writer(self.info) as document_writer:
        # Initialize info classes
        self.classinfo = ClassInfo()
        self.docinfo = DocumentInfo(document_writer)

        # Load documents and store class information in classinfo
        self.load_documents()

        # Print Meta Information
        self.docinfo.save_meta(self.info)
        self.classinfo.save_meta(self.info)

    # Save classinfo
    classes = self.classinfo.make_class_list()
    data.save_classes(classes, self.info)
def run(self):
    # Load data with sklearn
    nbprint('Loading raw files')
    self.rawdata = fetch_20newsgroups(
        data_home=join(config.paths['rawdata'], 'sklearn'),
        remove=tuple(self.info['data_info']['remove']),
        subset='all')

    with data.document_writer(self.info) as document_writer:
        # Initialize info classes
        self.classinfo = ClassInfo()
        self.docinfo = DocumentInfo(document_writer)

        # Load documents and store class information in classinfo
        self.load_documents()

        # Print Meta Information
        self.docinfo.save_meta(self.info)
        self.classinfo.save_meta(self.info)

    # Save classinfo
    classes = self.classinfo.make_class_list()
    data.save_classes(classes, self.info)
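# fetch_20newsgroups is scikit-learn's built-in loader for the 20 Newsgroups
# corpus: subset='all' merges the train and test splits, and remove=(...)
# strips any of 'headers', 'footers', 'quotes' from each post. A standalone
# usage example; whether load_documents() reads exactly these fields is an
# assumption.

from sklearn.datasets import fetch_20newsgroups

bunch = fetch_20newsgroups(subset='all', remove=('headers', 'quotes'))
print(len(bunch.data))          # raw document strings
print(bunch.target[:5])         # numeric class label per document
print(bunch.target_names[:3])   # class names, indexed by label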