def __init__(self, *args, **kwargs):
    """
    Configure the reader from the SENTTREE_* environment variables.

    All arguments are forwarded unchanged to DocumentReader.__init__.
    SENTTREE_DBSTRING is kept as ``dbstring`` and handed to
    PostgresDataRecorder; SENTTREE_PATH is kept as ``folderPath``.
    Both variables must be set, otherwise the lookup raises KeyError.
    To set in linux or mac: export SENTTREE_PATH=/some_directory
    """
    DocumentReader.__init__(self, *args, **kwargs)

    environment = os.environ
    connection_string = environment["SENTTREE_DBSTRING"]
    self.dbstring = connection_string
    self.postgres_recorder = PostgresDataRecorder(connection_string)
    self.folderPath = environment['SENTTREE_PATH']
def __init__(self,*args, **kwargs):
    """
    Configure the reader from the IMDB_* environment variables.

    All arguments are forwarded unchanged to DocumentReader.__init__.
    IMDB_DBSTRING is kept as ``dbstring`` and handed to
    PostgresDataRecorder; IMDB_PATH is kept as ``folderPath``.
    Both variables must be set, otherwise the lookup raises KeyError.
    To set in linux or mac:
    export IMDB_PATH=/some_directory_containing_IMDB_data
    """
    DocumentReader.__init__(self, *args, **kwargs)

    env = os.environ
    db_string = env["IMDB_DBSTRING"]
    self.dbstring = db_string
    self.postgres_recorder = PostgresDataRecorder(db_string)

    data_folder = env['IMDB_PATH']
    self.folderPath = data_folder
def __init__(self, *args, **kwargs):
    """
    Initialize the base class, then read the REUTERS_* environment.

    All arguments are forwarded unchanged to DocumentReader.__init__.
    REUTERS_DBSTRING is kept as ``dbstring`` and handed to
    PostgresDataRecorder; REUTERS_PATH is kept as ``folderPath``.
    Both variables must be set, otherwise the lookup raises KeyError.
    ``validationDict`` starts out as an empty dictionary.
    """
    DocumentReader.__init__(self, *args, **kwargs)

    settings = os.environ
    reuters_db = settings["REUTERS_DBSTRING"]
    self.dbstring = reuters_db
    self.postgres_recorder = PostgresDataRecorder(reuters_db)
    self.folderPath = settings['REUTERS_PATH']

    self.validationDict = dict()
def __init__(self, *args, **kwargs):
    """
    Configure the reader from the NEWSGROUP_* environment variables.

    All arguments are forwarded unchanged to DocumentReader.__init__.
    NEWSGROUP_DBSTRING is kept as ``dbstring`` and handed to
    PostgresDataRecorder; NEWSGROUP_PATH is kept as ``folderPath``.
    Both variables must be set, otherwise the lookup raises KeyError.
    To set in linux or mac:
    export NEWSGROUP_PATH=/some_directory_containing_newsgroup_data

    ``validationDict`` and ``topic_names`` start out empty.
    """
    DocumentReader.__init__(self, *args, **kwargs)

    environment = os.environ
    newsgroup_db = environment["NEWSGROUP_DBSTRING"]
    self.dbstring = newsgroup_db
    self.postgres_recorder = PostgresDataRecorder(newsgroup_db)
    self.folderPath = environment['NEWSGROUP_PATH']

    self.validationDict, self.topic_names = {}, []
def __init__(self,*args, **kwargs):
    """
    Initialize the base class, then read the DUC_* environment.

    All arguments are forwarded unchanged to DocumentReader.__init__.
    DUC_DBSTRING is kept as ``dbstring`` and handed to
    PostgresDataRecorder; DUC_PATH is kept as ``folderPath``.
    DUC_LAMBDA, DUC_DIVERSITY and DUC_TOPIC are stored as the raw
    environment strings (no conversion is done here).  Every one of
    these variables must be set, otherwise the lookup raises KeyError.
    """
    DocumentReader.__init__(self, *args, **kwargs)

    env = os.environ
    duc_db = env["DUC_DBSTRING"]
    self.dbstring = duc_db
    self.postgres_recorder = PostgresDataRecorder(duc_db)
    self.folderPath = env['DUC_PATH']

    # Bookkeeping lists start empty.
    self.processed_filenames = []
    self.processed_summaries = []

    # Remaining settings are copied verbatim, in this order, so a missing
    # variable fails at the same point as a plain sequence of assignments.
    for attribute, variable in (
            ('lambda_val', 'DUC_LAMBDA'),
            ('diversity', 'DUC_DIVERSITY'),
            ('duc_topic', 'DUC_TOPIC')):
        setattr(self, attribute, env[variable])

    self.document_id = 0
def __readAPass(self, load=0):
    """
    Walk the folder tree under ``self.folderPath`` once, numbering documents.

    Documents are numbered from 1 in listing order: first-level folder,
    then topic (in ``self.topic_names`` order), then file.  Only the eight
    topics in ``selected_topics`` are read; all other topics are skipped.
    Every file is read and passed through ``self.stripDocContent``
    regardless of ``load``.

    Args:
        load: When 0, ``self.topic_names`` is refreshed from
            ``self.readTopic()`` before the walk and nothing is written.
            When 1, each document is also written through
            ``self.postgres_recorder`` (document row, topic/category row)
            and handed to ``self._recordParagraphAndSentence``.
            Any other value only numbers the documents.

    Returns:
        list: ids of the documents whose first-level folder name has
        ``train`` (case-insensitive) as its last ``-``-separated part.
        Ids that also appear in ``self.validationDict`` are still included
        here, although they are recorded with the tag ``'VALID'``.
    """
    # Built once instead of once per topic per folder.
    selected_topics = frozenset((
        'talk.politics.mideast',
        'comp.graphics',
        'soc.religion.christian',
        'rec.autos',
        'sci.space',
        'talk.politics.guns',
        'rec.sport.baseball',
        'sci.med',
    ))

    if load == 0:
        self.topic_names = self.readTopic()

    train_doc_ids = []
    document_id = 0
    title = None

    for first_level_folder in os.listdir(self.folderPath):
        if DocumentReader._folderISHidden(self, first_level_folder):
            continue

        folder_path = os.path.join(self.folderPath, first_level_folder)
        if not os.path.isdir(folder_path):
            # os.listdir also returns plain files; listing topics beneath
            # one would raise, so stray files are skipped.
            continue

        # The split tag depends only on the folder name, so it is derived
        # once per folder rather than once per document.
        metadata, folder_istrain = None, None
        try:
            trainortest = first_level_folder.split('-')[-1]
            metadata = "SPLIT:%s" % trainortest
            folder_istrain = 'YES' if trainortest.lower() == 'train' else 'NO'
        except Exception:
            # Best effort: documents are still numbered with no split tag.
            Logger.logr.info("NO MetaData or Train Test Tag")

        for topic in self.topic_names:
            if topic not in selected_topics:
                continue

            topic_path = os.path.join(folder_path, topic)
            category = topic.split('.')[0]

            for file_ in os.listdir(topic_path):
                doc_content = self._getTextFromFile(
                    os.path.join(topic_path, file_))
                doc_content = self.stripDocContent(doc_content)
                document_id += 1

                istrain = folder_istrain
                if istrain == 'YES':
                    train_doc_ids.append(document_id)
                # The validation override is applied after the train list
                # is updated, so validation ids remain in train_doc_ids.
                if document_id in self.validationDict:
                    istrain = 'VALID'

                if load == 1:
                    self.postgres_recorder.insertIntoDocTable(
                        document_id, title, doc_content, file_, metadata)
                    self.postgres_recorder.insertIntoDocTopTable(
                        document_id, [topic], [category])
                    self._recordParagraphAndSentence(
                        document_id, doc_content, self.postgres_recorder,
                        topic, istrain)

    Logger.logr.info("A pass of the document reading complete.")
    return train_doc_ids
def readDocument(self, ld):
    """
    Rebuild the recorded documents from the files under ``self.folderPath``.

    The tables are truncated and their sequences altered through
    ``self.postgres_recorder`` first.  Then, for every non-hidden
    sub-directory of ``self.folderPath`` and every topic returned by
    ``self.readTopic()``, each file in ``<folderPath>/<sub-directory>/<topic>``
    is split on ``os.linesep`` and every resulting piece is recorded as one
    document.  Document ids are assigned sequentially starting at 1.

    Args:
        ld: Load flag.  Nothing is done when it is 0 or negative.

    Returns:
        int: 0 when ``ld <= 0`` (nothing is touched), otherwise 1 once all
        documents have been recorded.
    """
    if ld <= 0:
        return 0

    self.postgres_recorder.trucateTables()
    self.postgres_recorder.alterSequences()
    topic_names = self.readTopic()

    document_id = 0
    title = None

    # next(os.walk(...))[1] is the list of immediate sub-directories only.
    for first_level_folder in next(os.walk(self.folderPath))[1]:
        if DocumentReader._folderISHidden(self, first_level_folder):
            continue

        folder_path = os.path.join(self.folderPath, first_level_folder)

        # The split tag depends only on the folder name, so it is derived
        # once per folder rather than once per document.
        metadata, istrain = None, None
        try:
            trainortest = first_level_folder
            metadata = "SPLIT:%s" % trainortest
            istrain = 'YES' if trainortest.lower() == 'train' else 'NO'
        except Exception:
            # Best effort: documents are still recorded with no split tag.
            Logger.logr.info("NO MetaData or Train Test Tag")

        for topic in topic_names:
            topic_path = os.path.join(folder_path, topic)
            category = topic.split('.')[0]

            for file_ in os.listdir(topic_path):
                file_content = self._getTextFromFile(
                    os.path.join(topic_path, file_))

                # NOTE(review): splitting on os.linesep keeps empty pieces,
                # so blank lines and a trailing line separator are recorded
                # as empty documents -- confirm this is intended.
                for doc_content in file_content.split(os.linesep):
                    document_id += 1
                    self.postgres_recorder.insertIntoDocTable(
                        document_id, title, doc_content, file_, metadata)
                    self.postgres_recorder.insertIntoDocTopTable(
                        document_id, [topic], [category])
                    self._recordParagraphAndSentence(
                        document_id, doc_content, self.postgres_recorder,
                        topic, istrain)

    Logger.logr.info("Document reading complete.")
    return 1