class NewsgroupImporter(ImporterBase):
    def load_documents(self):
        text_class_pairs = zip(self.rawdata.data, self.rawdata.target)
        for text, class_idx in ProgressIterator(text_class_pairs,
                                                doc_progress_label,
                                                length=len(self.rawdata.data)):
            classname = self.rawdata.target_names[class_idx]
            class_id = self.classinfo.increase_class_count(classname)
            self.docinfo.add_document(text, class_id)

    def run(self):
        # Load data with sklearn
        nbprint('Loading raw files')
        self.rawdata = fetch_20newsgroups(
            data_home=join(config.paths['rawdata'], 'sklearn'),
            remove=tuple(self.info['data_info']['remove']),
            subset='all')

        with data.document_writer(self.info) as document_writer:
            # Initialize info classes
            self.classinfo = ClassInfo()
            self.docinfo = DocumentInfo(document_writer)
            # Load documents and store class information in classinfo
            self.load_documents()
            # Print Meta Information
            self.docinfo.save_meta(self.info)
            self.classinfo.save_meta(self.info)

        # Save classinfo
        classes = self.classinfo.make_class_list()
        data.save_classes(classes, self.info)
class ClassicImporter(ImporterBase):
    def load_file(self, filename):
        filename = join(self.folder, filename)
        with open(filename, "r", encoding="utf8") as file:
            text = file.read()
        return text

    def load_documents(self):
        for filename in ProgressIterator(self.files, doc_progress_label):
            classname = filename.split(".")[0]
            class_id = self.classinfo.increase_class_count(classname)
            text = self.load_file(filename)
            self.docinfo.add_document(text, class_id)

    def run(self):
        # Get all files in the classic4 directory
        self.folder = join(config.paths["rawdata"], "classic4")
        try:
            self.files = listdir(self.folder)
        except FileNotFoundError:
            raise ImporterError(
                self.info, 'Directory "{}" does not exist'.format(self.folder))

        # Remove the .gitignore file from the list
        self.files = [file for file in self.files if file != '.gitignore']

        # Keep only files that start with a classname
        self.classnames = ['cacm', 'cisi', 'cran', 'med']
        self.files = [
            file for file in self.files
            if '.' in file and file.split('.')[0] in self.classnames
        ]

        # Check that at least one valid file exists
        if len(self.files) == 0:
            raise ImporterError(self.info,
                                'There are no valid files in the folder.')

        with data.document_writer(self.info) as document_writer:
            # Initialize info classes
            self.classinfo = ClassInfo()
            self.docinfo = DocumentInfo(document_writer)
            # Load documents and store class information in classinfo
            self.load_documents()
            # Print Meta Information
            self.docinfo.save_meta(self.info)
            self.classinfo.save_meta(self.info)

        # Save classinfo
        classes = self.classinfo.make_class_list()
        data.save_classes(classes, self.info)
class YahooImporter(ImporterBase):
    def load_documents(self):
        nbprint('Loading xml file')
        self.documents = []
        filename = join(config.paths["rawdata"], "yahooL5/manner.xml")
        current_doc = None
        # Stream the XML file instead of loading it into memory at once
        for event, elem in etree.iterparse(filename,
                                           events=('start', 'end'),
                                           recover=True):
            if elem.tag == "document":
                if event == "start":
                    current_doc = Document()
                elif event == "end":
                    if current_doc.complete():
                        self.documents.append(current_doc)
                    current_doc = None
            elif event == "end" and current_doc is not None:
                current_doc.add_elem(elem)

    def save_documents(self):
        nbprint('Saving documents')
        self.classinfo = ClassInfo()
        # Open Writer
        with data.document_writer(self.info) as document_writer:
            self.docinfo = DocumentInfo(document_writer)
            for doc in self.documents:
                text = doc.content['subject']
                class_id = self.classinfo.increase_class_count(
                    doc.content['maincat'])
                self.docinfo.add_document(text, class_id)

    def run(self):
        self.load_documents()
        self.save_documents()

        # Save the classes
        classes = self.classinfo.make_class_list()
        data.save_classes(classes, self.info)

        # Print Meta Info
        self.docinfo.save_meta(self.info)
        self.classinfo.save_meta(self.info)
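The Document helper that YahooImporter relies on is defined elsewhere in the codebase and not shown in this section. A minimal, hypothetical sketch of what such a class could look like, assuming each <document> element in manner.xml carries <subject> and <maincat> child elements (both names taken from the importer above), might be:

# Hypothetical sketch only: the actual Document class lives elsewhere in the codebase.
# It collects the child elements of one <document> node during the streaming parse.
class Document:
    # Fields the YahooImporter reads later: 'subject' (text) and 'maincat' (class label)
    required_fields = ('subject', 'maincat')

    def __init__(self):
        self.content = {}

    def add_elem(self, elem):
        # Store the text of a child element such as <subject> or <maincat>
        if elem.text is not None:
            self.content[elem.tag] = elem.text

    def complete(self):
        # A document is usable only if all required fields were present
        return all(field in self.content for field in self.required_fields)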
class ComplaintsImporter(ImporterBase):
    def load_classes(self, file):
        self.valid_classes = ClassInfo()
        min_length = self.info['data_info']['min_length']

        cr = csv.reader(file)
        next(cr)  # skip the header row
        for row in ProgressIterator(cr):
            classname = row[2]
            text = row[5]
            if len(text) >= min_length:
                self.valid_classes.increase_class_count(classname)

        # Keep only classes with more than min_class_size documents
        min_class_size = self.info['data_info']['min_class_size']
        self.valid_classes = [
            c['info'] for c in self.valid_classes.make_class_list()
            if c['count'] > min_class_size
        ]

    def load_data(self, file):
        min_length = self.info['data_info']['min_length']

        cr = csv.reader(file)
        next(cr)  # skip the header row
        for row in ProgressIterator(cr):
            classname = row[2]
            text = row[5]
            if len(text) >= min_length and classname in self.valid_classes:
                class_id = self.classinfo.increase_class_count(classname)
                self.docinfo.add_document(text, class_id)

    def run(self):
        self.classinfo = ClassInfo()
        filename = join(config.paths["rawdata"],
                        "complaints/consumer_complaints.csv")

        # First pass: determine which classes are large enough
        with open(filename) as file:
            self.load_classes(file)

        # Second pass: import all documents belonging to valid classes
        with data.document_writer(self.info) as document_writer:
            self.docinfo = DocumentInfo(document_writer)
            with open(filename) as file:
                self.load_data(file)

        # Save the classes
        classes = self.classinfo.make_class_list()
        data.save_classes(classes, self.info)

        # Print Meta Info
        self.docinfo.save_meta(self.info)
        self.classinfo.save_meta(self.info)
class TweetsLAImporter(ImporterBase):
    def get_archives(self, folder):
        # List all files
        try:
            files = listdir(folder)
        except FileNotFoundError:
            raise ImporterError(self.info,
                                'Directory "{}" does not exist'.format(folder))
        # Keep only .zip files
        archives = [file for file in files if file.split(".")[-1] == "zip"]
        return archives

    def parse_file(self, jsonfile):
        for line in ProgressIterator(jsonfile, 'Parsing tweets'):
            tweet = json.loads(line)
            # Prefer the full text of extended tweets, fall back to the short text
            if 'extended_tweet' in tweet:
                text = tweet['extended_tweet']['full_text']
            elif 'text' in tweet:
                text = tweet['text']
            else:
                continue
            self.docinfo.add_document(text)

    def import_archive(self):
        # Iterate all files in the archive
        with zipfile.ZipFile(self.archivepath) as zip:
            filenames = [info.filename for info in zip.infolist()]
            for filename in filenames:
                nbprint(filename)
                with zip.open(filename) as jsonfile:
                    self.parse_file(jsonfile)

    def run(self):
        # Open Writer
        with data.document_writer(self.info) as document_writer:
            self.docinfo = DocumentInfo(document_writer)
            # Iterate all archives
            folder = join(config.paths["rawdata"], "tweetsla")
            archives = self.get_archives(folder)
            for idx, archive in enumerate(archives):
                nbprint('{}/{}: {}'.format(idx + 1, len(archives),
                                           archive)).push()
                self.archivepath = join(folder, archive)
                self.import_archive()
                nbprint.pop()
            # Print Meta Info
            self.docinfo.save_meta(self.info)
def run(self):
    folder = join(config.paths["rawdata"], "atd")

    # List txt files
    try:
        files = listdir(folder)
    except FileNotFoundError:
        raise ImporterError(self.info,
                            'Directory "{}" does not exist'.format(folder))

    # Keep only .txt files
    files = [file for file in files if file.split(".")[-1] == "txt"]

    # Check that at least one valid file exists
    if len(files) == 0:
        raise ImporterError(self.info, 'There are no valid files in the folder.')

    # Add files one by one
    with data.document_writer(self.info) as document_writer:
        docinfo = DocumentInfo(document_writer)
        for filename in ProgressIterator(files, doc_progress_label):
            if filename.split(".")[-1] != "txt":
                continue
            with open(join(folder, filename), "r", encoding="utf8") as file:
                text = file.read()
            docinfo.add_document(text)

        # Print Meta Information
        docinfo.save_meta(self.info)
class ACMImporter(ImporterBase):
    def import_archive(self):
        # Iterate all files in the archive
        with zipfile.ZipFile(self.archivepath) as zip:
            filenames = [info.filename for info in zip.infolist()]
            for filename in ProgressIterator(filenames):
                if filename.endswith('.txt'):
                    with zip.open(filename, 'r') as txtfile:
                        text = txtfile.read().decode('utf-8')
                        self.docinfo.add_document(text)

    def run(self):
        # Open Writer
        with data.document_writer(self.info) as document_writer:
            self.docinfo = DocumentInfo(document_writer)
            # Open archive
            self.archivepath = join(config.paths['rawdata'], 'acm/abstract.zip')
            self.import_archive()
            # Print Meta Info
            self.docinfo.save_meta(self.info)
class TweetsODPImporter(ImporterBase):
    def parse_files(self, jsonfile):
        nbprint("Loading documents")
        for line in ProgressIterator(jsonfile):
            tweet = json.loads(line)
            text = tweet["full_text"]
            # The numeric "id" field is incorrect/rounded, use "id_str" instead
            id = int(tweet["id_str"])
            classname = self.id_to_classname[id]
            # Skip classes that have already reached max_docs_per_cls
            if (self.max_docs_per_cls is not None
                    and self.classinfo.classes.get(
                        classname, (0, 0))[1] >= self.max_docs_per_cls):
                continue
            else:
                class_id = self.classinfo.increase_class_count(classname)
                self.docinfo.add_document(text, class_id)

    def load_id_to_classname(self, folderpath, filename):
        nbprint("Extracting tsv")
        self.id_to_classname = {}
        max_depth = self.info['data_info']['maxdepth']
        tarfilename = join(folderpath, filename + ".tar.bz2")
        with tarfile.open(tarfilename, "r:bz2") as tar:
            tsvfile = tar.extractfile(filename + ".tsv")
            for line in ProgressIterator(tsvfile):
                fields = line.decode().split()
                id = int(fields[0])
                # Trim the class hierarchy to at most max_depth levels
                classname = fields[3]
                classname = classname.strip("*")
                classhierarchy = classname.split("/")
                classhierarchy = classhierarchy[1:max_depth + 1]
                classname = "/".join(classhierarchy)
                self.id_to_classname[id] = classname

    def add_data(self, filename):
        nbprint("Loading '{}'".format(filename)).push()
        folderpath = join(config.paths["rawdata"], "tweetsodp")
        jsonfilename = join(folderpath, filename + ".json")
        zipfilename = join(folderpath, filename + ".json.zip")

        self.load_id_to_classname(folderpath, filename)

        # Prefer an unpacked .json file, otherwise fall back to the zipped version
        if isfile(jsonfilename):
            with open(jsonfilename, "r") as jsonfile:
                self.parse_files(jsonfile)
        else:
            with zipfile.ZipFile(zipfilename) as zip:
                with zip.open(filename + ".json") as jsonfile:
                    self.parse_files(jsonfile)

        nbprint.pop()

    def run(self):
        self.classinfo = ClassInfo()
        self.max_docs_per_cls = self.info['data_info'].get(
            'max_docs_per_cls', None)

        with data.document_writer(self.info) as document_writer:
            self.docinfo = DocumentInfo(document_writer)
            self.add_data("ODPtweets-Mar17-29")
            self.add_data("ODPtweets-Apr12-24")

        # Save the classes
        classes = self.classinfo.make_class_list()
        data.save_classes(classes, self.info)

        # Print Meta Info
        self.docinfo.save_meta(self.info)
        self.classinfo.save_meta(self.info)
class ReutersImporter(ImporterBase):
    def count_docs_per_class(self):
        # Count only documents that belong to exactly one category
        counts = {}
        for file in reuters.fileids():
            categories = reuters.categories(file)
            if len(categories) == 1:
                classname = categories[0]
                try:
                    counts[classname] += 1
                except KeyError:
                    counts[classname] = 1
        return counts

    def filter_classes(self, counts):
        newcounts = {}
        for key, val in counts.items():
            if val >= self.info['data_info']["min_docs_per_class"]:
                newcounts[key] = val
        return newcounts

    def load_valid_classes(self):
        counts = self.count_docs_per_class()
        counts = self.filter_classes(counts)
        self.valid_classes = list(counts)

    def load_documents(self):
        for file in ProgressIterator(reuters.fileids(), doc_progress_label):
            categories = reuters.categories(file)
            if len(categories) > 1:
                continue
            classname = categories[0]
            if classname not in self.valid_classes:
                continue
            class_id = self.classinfo.increase_class_count(classname)

            # Join the tokenized corpus back into text and undo the most common
            # tokenization artifacts around numbers and punctuation
            text = " ".join(reuters.words(file))
            text = re.sub(r"(\d+) \. (\d+)", r"\1.\2", text)
            text = re.sub(r"(\d+) \, (\d+)", r"\1,\2", text)
            text = re.sub(r" \.", ".", text)
            text = re.sub(r" \.", ".", text)  # applied twice to catch double spaces
            text = re.sub(r" \,", ",", text)
            text = re.sub(r" \)", ")", text)
            text = re.sub(r"\( ", "(", text)
            text = re.sub(r" \' ", "'", text)

            self.docinfo.add_document(text, class_id)

    def run(self):
        # Set the NLTK path (http://www.nltk.org/_modules/nltk/data.html)
        nltk_path.append(join(config.paths["rawdata"], "nltk"))

        try:
            # Check which classes are valid depending on min_docs_per_class
            nbprint('Loading classes')
            self.load_valid_classes()

            # Load the documents
            with data.document_writer(self.info) as document_writer:
                # Initialize info classes
                self.classinfo = ClassInfo()
                self.docinfo = DocumentInfo(document_writer)
                # Load documents and store class information in classinfo
                self.load_documents()
                # Print Meta Information
                self.docinfo.save_meta(self.info)
                self.classinfo.save_meta(self.info)
        except (LookupError, FileNotFoundError):
            raise ImporterError(
                self.info,
                'Directory "{}" does not contain the required corpus.'.format(
                    nltk_path))

        # Save the classes
        classes = self.classinfo.make_class_list()
        data.save_classes(classes, self.info)