Ejemplo n.º 1
0
 def run(self):
     """
     Execute the clean operation using specified parameters.

     When self.update is set, the existing file hash index is loaded from
     the output folder before cleaning, and afterwards the index and the
     output folder are purged of entries that no longer correspond to a
     cleaned record. When it is not set, the output folder is cleared before
     cleaning. The file hash index is written to the output folder in both
     cases, and the elapsed time is logged and printed.

     Raises AssertionError if the source or the output path does not exist.
     @todo this needs to be cleaned up and simplified
     """
     with Timer.Timer() as t:
         # validate the source first, so that a bad source path cannot cost
         # us the existing contents of the output folder. The check is
         # explicit rather than an assert statement so that it still runs
         # under python -O; AssertionError is kept for existing callers.
         if not os.path.exists(self.source):
             message = "Source path does not exist: " + self.source
             self.log.error(message)
             raise AssertionError(message)
         # create an index of file hashes, so that we can track what has changed
         # NOTE(review): when not updating, self.hashIndex is not assigned in
         # this method -- presumably it is initialised elsewhere; confirm.
         if self.update:
             self.hashIndex = Utils.loadFileHashIndex(self.output)
         # create the output folder, and clear it unless we are updating
         if not os.path.exists(self.output):
             os.makedirs(self.output)
         if not self.update:
             Utils.cleanOutputFolder(self.output)
         if not os.path.exists(self.output):
             message = "Output path does not exist: " + self.output
             self.log.error(message)
             raise AssertionError(message)
         # clean data
         records = self.clean()
         if self.update:
             # remove records from the index that were deleted in the source
             self.log.info("Clearing orphaned records from the file hash index")
             Utils.purgeIndex(records, self.hashIndex)
             # remove files from the output that are not in the index
             self.log.info("Clearing orphaned files from the output folder")
             Utils.purgeFolder(self.output, self.hashIndex)
         # write the updated file hash index
         Utils.writeFileHashIndex(self.hashIndex, self.output)
     # log execution time, and echo the same message to stdout
     finished = "Cleaner finished in {0}:{1}:{2}".format(t.hours, t.minutes, t.seconds)
     self.log.info(finished)
     print(finished)
Ejemplo n.º 2
0
 def run(self):
     """
     Execute transformations on source documents as specified. Write results
     to the output path.

     The output folder is created if it is missing, and cleared when 'clear'
     is one of the requested actions. Each remaining action named in
     self.actions is then executed in the fixed order shown below, and the
     elapsed time is logged.

     Raises AssertionError if the output path does not exist after setup.
     """
     with Timer.Timer() as t:
         # create output folder
         if not os.path.exists(self.output):
             os.makedirs(self.output)

         #TODO: Use the output flag instead.
         if 'clear' in self.actions:
             Utils.cleanOutputFolder(self.output)
         # explicit check rather than an assert statement, so that it still
         # runs under python -O and carries a real message; AssertionError is
         # kept so that existing callers see the same exception type
         if not os.path.exists(self.output):
             message = "Output path does not exist: {0}".format(self.output)
             self.log.error(message)
             raise AssertionError(message)
         # execute processing actions
         if "digitalobjects-to-sid" in self.actions:
             self.transformDigitalObjectsToSID(self.sources, self.output)
         if "eaccpf-to-sid" in self.actions:
             transform = Utils.loadTransform(self.xslt)
             self.transformEacCpfsToSID(self.sources, self.output, transform)
         if "html-to-sid" in self.actions:
             self.transformHtmlsToSid(self.sources, self.output)
         if 'merge-digitalobjects' in self.actions:
             self.mergeDigitalObjectsIntoSID(self.sources, self.output)
         if "merge-inferred" in self.actions:
             self.mergeInferredRecordsIntoSID(self.sources, self.output)
         # field values are only set when none of the configured fields is empty
         if "set-fields" in self.actions and '' not in self.set_fields:
             self.setFieldValue(self.output)
         if 'boost' in self.actions:
             self.setBoosts(self.output)
         if "validate" in self.actions:
             # recognised action name, but no validation is implemented here
             pass

     # log execution time
     self.log.info("Transformer finished in {0}:{1}:{2}".format(t.hours, t.minutes, t.seconds))
Ejemplo n.º 3
0
 def run(self):
     """
     Execute analysis operations using specified parameters.

     Validates the source path, prepares the output folder, builds the
     entity graph and then saves it as a PDF and as a GEXF file.

     Raises AssertionError if the source or the output path does not exist.
     """
     # validate the source before the output folder is touched, so that a bad
     # source path cannot cost us existing output. The checks are explicit
     # rather than assert statements so that they still run under python -O;
     # AssertionError is kept so that existing callers see the same type.
     if not os.path.exists(self.source):
         message = "Source path does not exist: " + self.source
         self.log.error(message)
         raise AssertionError(message)
     # make output folder
     Utils.cleanOutputFolder(self.output)
     if not os.path.exists(self.output):
         message = "Output path does not exist: " + self.output
         self.log.error(message)
         raise AssertionError(message)
     # execute actions
     self.graph_entities()
     # generate a PDF of the graph
     self.save_graph_as_pdf()
     # write graph file
     self.save_graph_as_gexf()
Ejemplo n.º 4
0
 def run(self):
     """
     Execute crawl operation.

     The source is crawled as a web site when it contains 'http://' or
     'https://', and as a file system path otherwise. When self.update is
     set, the existing file hash index is loaded before the crawl, and
     afterwards the index, the output folder and the image cache are purged
     of entries that are no longer present in the source. The file hash
     index is written to the output folder in both cases, and the elapsed
     time is logged.

     Raises AssertionError if a file system source does not exist, or if the
     output path does not exist after setup.
     """
     with Timer.Timer() as t:
         is_web_source = 'http://' in self.source or 'https://' in self.source
         # check state before starting. A URL is not a file system path and
         # never passes os.path.exists, so only local sources are checked
         # here; checking unconditionally made the web crawl unreachable.
         # The checks are explicit rather than assert statements so that
         # they still run under python -O; AssertionError is kept so that
         # existing callers see the same exception type.
         if not is_web_source and not os.path.exists(self.source):
             message = "Input path does not exist: {0}".format(self.source)
             self.log.error(message)
             raise AssertionError(message)
         if not os.path.exists(self.output):
             os.makedirs(self.output)
         Utils.cleanOutputFolder(self.output, Update=self.update)
         if not os.path.exists(self.output):
             message = "Output path does not exist: {0}".format(self.output)
             self.log.error(message)
             raise AssertionError(message)
         # purge the image cache
         if not self.update:
             self.cache.purge()
         # create an index of files hashes so that we can track which files
         # have changed since the last run
         # NOTE(review): when not updating, self.hashIndex is not assigned in
         # this method -- presumably it is initialised elsewhere; confirm.
         self.records = []
         if self.update:
             self.hashIndex = Utils.loadFileHashIndex(self.output)
         # crawl the document source
         if is_web_source:
             self.crawlWebSite()
         else:
             self.crawlFileSystem()
         # if the crawl was executed as an update, then synchronize the file
         # index, metadata cache, and image cache folders with the source
         if self.update:
             # remove records from the index that were deleted in the source
             self.log.info("Clearing orphaned records from the file hash index")
             Utils.purgeIndex(self.records, self.hashIndex)
             # remove files from the metadata cache that are not in the index
             self.log.info("Clearing orphaned files from the output folder")
             Utils.purgeFolder(self.output, self.hashIndex)
             # remove files from the image cache that are not in the index
             self.log.info("Clearing orphaned files from the image cache")
             self.cache.purge(list(self.hashIndex.keys()))
         # write the updated file index
         Utils.writeFileHashIndex(self.hashIndex, self.output)
     # log execution time
     self.log.info("Crawler finished in {0}:{1}:{2}".format(t.hours, t.minutes, t.seconds))