def run(self):
    """
    Execute the clean operation using specified parameters.

    Ensures the output folder exists, empties it unless running as an
    update, cleans the data via ``self.clean()`` and, for updates, purges
    orphaned entries from the file hash index and the output folder. The
    file hash index is then written to the output folder and the elapsed
    time is logged and printed.

    Raises:
        AssertionError: if the source or output path does not exist.

    @todo this needs to be cleaned up and simplified
    """
    with Timer.Timer() as t:
        # create an index of file hashes, so that we can track what has changed
        if self.update:
            self.hashIndex = Utils.loadFileHashIndex(self.output)
        # clear output folder
        if not os.path.exists(self.output):
            os.makedirs(self.output)
        if not self.update:
            Utils.cleanOutputFolder(self.output)
        # check state. This is an explicit check rather than an ``assert``
        # statement so that it still runs under ``python -O`` and so that the
        # exception carries the message. AssertionError is kept so existing
        # callers that catch it continue to work.
        for prefix, path in (
            ("Source path does not exist: ", self.source),
            ("Output path does not exist: ", self.output),
        ):
            if not os.path.exists(path):
                message = prefix + path
                self.log.error(message)
                raise AssertionError(message)
        # clean data
        records = self.clean()
        if self.update:
            # remove records from the index that were deleted in the source
            self.log.info("Clearing orphaned records from the file hash index")
            Utils.purgeIndex(records, self.hashIndex)
            # remove files from the output that are not in the index
            self.log.info("Clearing orphaned files from the output folder")
            Utils.purgeFolder(self.output, self.hashIndex)
        # write the updated file hash index
        # NOTE(review): when self.update is false this relies on
        # self.hashIndex having been initialised elsewhere - confirm.
        Utils.writeFileHashIndex(self.hashIndex, self.output)
    # log execution time
    message = "Cleaner finished in {0}:{1}:{2}".format(t.hours, t.minutes, t.seconds)
    self.log.info(message)
    print(message)
def run(self):
    """
    Execute transformations on source documents as specified. Write results
    to the output path.

    The output folder is created when missing and emptied when the 'clear'
    action is requested. Each remaining entry in ``self.actions`` selects
    one processing step; the steps run in the fixed order written below,
    not in the order in which they appear in ``self.actions``.

    Raises:
        AssertionError: if the output path does not exist after the
            create/clear step.
    """
    with Timer.Timer() as t:
        # create output folder
        if not os.path.exists(self.output):
            os.makedirs(self.output)
        # TODO: Use the output flag instead.
        if 'clear' in self.actions:
            Utils.cleanOutputFolder(self.output)
        # Explicit check rather than an ``assert`` statement so that it still
        # runs under ``python -O`` and the exception carries the message.
        # AssertionError is kept for backward compatibility with callers.
        if not os.path.exists(self.output):
            message = "Output path does not exist: {0}".format(self.output)
            self.log.error(message)
            raise AssertionError(message)
        # execute processing actions
        if "digitalobjects-to-sid" in self.actions:
            self.transformDigitalObjectsToSID(self.sources, self.output)
        if "eaccpf-to-sid" in self.actions:
            transform = Utils.loadTransform(self.xslt)
            self.transformEacCpfsToSID(self.sources, self.output, transform)
        if "html-to-sid" in self.actions:
            self.transformHtmlsToSid(self.sources, self.output)
        if 'merge-digitalobjects' in self.actions:
            self.mergeDigitalObjectsIntoSID(self.sources, self.output)
        if "merge-inferred" in self.actions:
            self.mergeInferredRecordsIntoSID(self.sources, self.output)
        # field values are only applied when none of the configured entries
        # is an empty string
        if "set-fields" in self.actions and '' not in self.set_fields:
            self.setFieldValue(self.output)
        if 'boost' in self.actions:
            self.setBoosts(self.output)
        if "validate" in self.actions:
            # the action is recognised but no work is performed for it here
            pass
    # log execution time
    self.log.info("Transformer finished in {0}:{1}:{2}".format(t.hours, t.minutes, t.seconds))
def run(self):
    """
    Execute analysis operations using specified parameters.

    Prepares the output folder, verifies that the source and output paths
    exist, builds the entity graph and saves it as a PDF and as a GEXF
    file.

    Raises:
        AssertionError: if the source or output path does not exist.
    """
    # make output folder
    # NOTE(review): presumably cleanOutputFolder also creates the folder
    # when it is missing - confirm against Utils.
    Utils.cleanOutputFolder(self.output)
    # check state. This is an explicit check rather than an ``assert``
    # statement so that it still runs under ``python -O`` and so that the
    # exception carries the message. AssertionError is kept so existing
    # callers that catch it continue to work.
    for prefix, path in (
        ("Source path does not exist: ", self.source),
        ("Output path does not exist: ", self.output),
    ):
        if not os.path.exists(path):
            message = prefix + path
            self.log.error(message)
            raise AssertionError(message)
    # execute actions
    self.graph_entities()
    # generate a PDF of the graph
    self.save_graph_as_pdf()
    # write graph file
    self.save_graph_as_gexf()
def run(self):
    """
    Execute crawl operation.

    Crawls either a web site (when ``self.source`` contains 'http://' or
    'https://') or the file system. When run as an update, the file hash
    index, the output folder and the image cache are afterwards
    synchronized with the records that were found. The file hash index is
    then written to the output folder and the elapsed time is logged.

    Raises:
        AssertionError: if a file system source path does not exist, or if
            the output path does not exist after it has been prepared.
    """
    with Timer.Timer() as t:
        source_is_url = 'http://' in self.source or 'https://' in self.source
        # check state before starting. A URL is never an existing local
        # path, so the existence check only applies to file system sources;
        # checking it unconditionally made the web site branch unreachable.
        # Explicit checks are used instead of ``assert`` statements so that
        # they still run under ``python -O`` and the exception carries the
        # message; AssertionError is kept for backward compatibility.
        if not source_is_url and not os.path.exists(self.source):
            message = "Input path does not exist: {0}".format(self.source)
            self.log.error(message)
            raise AssertionError(message)
        if not os.path.exists(self.output):
            os.makedirs(self.output)
        Utils.cleanOutputFolder(self.output, Update=self.update)
        if not os.path.exists(self.output):
            message = "Output path does not exist: {0}".format(self.output)
            self.log.error(message)
            raise AssertionError(message)
        # purge the image cache
        if not self.update:
            self.cache.purge()
        # create an index of files hashes so that we can track which files
        # have changed since the last run
        self.records = []
        if self.update:
            self.hashIndex = Utils.loadFileHashIndex(self.output)
        # crawl the document source
        if source_is_url:
            self.crawlWebSite()
        else:
            self.crawlFileSystem()
        # if the crawl was executed as an update, then synchronize the file
        # index, metadata cache, and image cache folders with the source
        if self.update:
            # remove records from the index that were deleted in the source
            self.log.info("Clearing orphaned records from the file hash index")
            Utils.purgeIndex(self.records, self.hashIndex)
            # remove files from the metadata cache that are not in the index
            self.log.info("Clearing orphaned files from the output folder")
            Utils.purgeFolder(self.output, self.hashIndex)
            # remove files from the image cache that are not in the index
            self.log.info("Clearing orphaned files from the image cache")
            self.cache.purge(list(self.hashIndex.keys()))
        # write the updated file index
        # NOTE(review): when self.update is false this relies on
        # self.hashIndex having been initialised elsewhere - confirm.
        Utils.writeFileHashIndex(self.hashIndex, self.output)
    # log execution time
    self.log.info("Crawler finished in {0}:{1}:{2}".format(t.hours, t.minutes, t.seconds))