def _close_index_files(self):
    """Close every open index file handle and hand its path to read_only().

    The handles are self.fh_ids, self.fh_files, self.fh_dates and all values
    of the self.fh_processed mapping. read_only() is defined elsewhere;
    presumably it removes write permission from the file -- confirm there.
    """
    handles = [self.fh_ids, self.fh_files, self.fh_dates]
    handles.extend(self.fh_processed.values())
    for handle in handles:
        # Remember the path before closing so it can be passed on afterwards.
        path = handle.name
        handle.close()
        read_only(path)
def _create_general_config_file(self):
    """Write the general info file inside self.conf_path.

    The file is named by FNAME_INFO_GENERAL and receives self.command
    followed by the concatenation of the strings in self.settings. Once
    written, its path is passed to read_only() (defined elsewhere;
    presumably it removes write permission -- confirm there).
    """
    filename = os.path.join(self.conf_path, FNAME_INFO_GENERAL)
    # Single-argument print with parentheses prints the same text whether
    # print is a statement or a function.
    print("[--init] creating %s" % filename)
    # The with-block closes (and therefore flushes) the handle before
    # read_only() is applied, instead of leaving it to garbage collection.
    with open(filename, 'w') as fh:
        fh.write(self.command)
        fh.write("".join(self.settings))
    read_only(filename)
def create_skeleton_on_disk(self, repotype):
    """Lay out an empty repository on disk.

    Creates the directories self.doc_dir, self.log_dir, self.idx_dir,
    self.data_dir and self.proc_dir, writes repotype plus a newline to
    self.type_file, and creates every path returned by self._index_files()
    as an empty file. Each file created here is passed to read_only().
    The type file is the only one that ends up with content.
    """
    directories = (self.doc_dir, self.log_dir, self.idx_dir,
                   self.data_dir, self.proc_dir)
    for directory in directories:
        os.makedirs(directory)

    with open(self.type_file, 'w') as type_fh:
        type_fh.write("%s\n" % repotype)
    read_only(self.type_file)

    for index_path in self._index_files():
        # Opening for writing and closing immediately yields an empty file.
        with open(index_path, 'w'):
            pass
        read_only(index_path)
def add_info_file(corpus_dir, extra_files, added):
    """Append a record of this run to the additions file in CORPUS/config.

    The file is named by corpus.FNAME_INFO_ADDITIONS. The record holds the
    command line (sys.argv), a locale-formatted timestamp, the extra_files
    argument, the added count and the value of get_git_commit(). The file
    is passed to make_writable() before appending and to read_only()
    afterwards; both helpers are defined elsewhere.

    corpus_dir  -- path of the corpus directory
    extra_files -- value recorded as "file_list"
    added       -- value recorded as "files_added"
    """
    info_file = os.path.join(corpus_dir, 'config', corpus.FNAME_INFO_ADDITIONS)
    make_writable(info_file)
    # The with-block guarantees the handle is closed even when one of the
    # writes (or get_git_commit()) raises.
    with open(info_file, 'a') as fh:
        fh.write("$ %s\n\n" % ' '.join(sys.argv))
        fh.write("timestamp = %s\n" % time.strftime("%x %X"))
        fh.write("file_list = %s\n" % extra_files)
        fh.write("files_added = %s\n" % added)
        fh.write("git_commit = %s\n\n\n" % get_git_commit())
    read_only(info_file)
def _create_filelist(self):
    """Populate self.file_list from a given list or from a directory.

    When self.source_file is set it is copied verbatim. Otherwise, when
    self.source_path is set, the paths returned by get_file_paths() are
    written one per line, each prefixed with "0000" and a tab, after an
    optional shuffle controlled by self.shuffle_file. With neither source
    defined the process exits with an error message. On success the file
    list is passed to read_only().
    """
    # Single-argument print with parentheses prints the same text whether
    # print is a statement or a function.
    print("[--init] creating %s" % self.file_list)
    if self.source_file is not None:
        shutil.copyfile(self.source_file, self.file_list)
    elif self.source_path is None:
        sys.exit("[--init] ERROR: need to define input with --filelist or "
                 "--source-directory option, aborting")
    else:
        paths = get_file_paths(self.source_path)
        if self.shuffle_file:
            random.shuffle(paths)
        with open(self.file_list, 'w') as out:
            out.writelines("0000\t" + path + "\n" for path in paths)
    read_only(self.file_list)
def add_files_to_corpus(corpus_dir, extra_files):
    """Append lines in extra_files to the corpus file list.

    The file list lives at CORPUS/config/<corpus.FNAME_FILELIST>. A
    time-stamped backup copy of it is made first. A line is appended only
    when its second tab-separated field (the file name) is not already in
    the file list and has not been added earlier in this same run. Blank
    lines in extra_files are ignored. Finally the run is recorded through
    add_info_file().

    corpus_dir  -- path of the corpus directory; the process exits with a
                   message when it is not an existing directory
    extra_files -- path of a tab-separated file whose second column holds
                   the file names to add
    """
    if not os.path.isdir(corpus_dir):
        # sys.exit() instead of the site-provided exit(), which is intended
        # for interactive use and is missing when site is not loaded.
        sys.exit("WARNING: there is no corpus at %s" % corpus_dir)
    fname_current = os.path.join(corpus_dir, 'config', corpus.FNAME_FILELIST)
    # NOTE(review): the [:-4] slice assumes the file list name ends in a
    # four-character extension such as ".txt" -- confirm against
    # corpus.FNAME_FILELIST.
    fname_current_bak = "%s-%s.txt" % (fname_current[:-4],
                                       time.strftime("%Y%m%d:%H%M%S"))
    make_writable(fname_current)
    shutil.copyfile(fname_current, fname_current_bak)
    current_files = read_files(fname_current)
    # Names added during this run. Kept separately because the container
    # type returned by read_files() is not visible from here.
    newly_added = set()
    added = 0
    # Nested with-blocks close both handles even when a line is malformed.
    with open(extra_files) as fh_extra:
        with open(fname_current, 'a') as fh_current:
            for line in fh_extra:
                stripped = line.strip()
                if not stripped:
                    # A blank (e.g. trailing) line carries no file name.
                    continue
                fname = stripped.split("\t")[1]
                if fname in current_files or fname in newly_added:
                    continue
                newly_added.add(fname)
                added += 1
                print("adding %s" % fname)
                fh_current.write(line)
    read_only(fname_current)
    add_info_file(corpus_dir, extra_files, added)
def _close_log(self):
    """Close self.log and pass the path it was opened on to read_only()."""
    log = self.log
    # Capture the path before closing so it can be handed on afterwards.
    log_path = log.name
    log.close()
    read_only(log_path)
def _create_default_pipeline_config_file(self):
    """Write the default pipeline configuration file inside self.conf_path.

    The file is named by FNAME_PIPELINE_DEFAULT and receives
    self.pipeline_config with leading whitespace stripped. Once written,
    its path is passed to read_only() (defined elsewhere; presumably it
    removes write permission -- confirm there).
    """
    filename = os.path.join(self.conf_path, FNAME_PIPELINE_DEFAULT)
    # Single-argument print with parentheses prints the same text whether
    # print is a statement or a function.
    print("[--init] creating %s" % filename)
    # The with-block closes (and therefore flushes) the handle before
    # read_only() is applied, instead of leaving it to garbage collection.
    with open(filename, 'w') as fh:
        fh.write(self.pipeline_config.lstrip())
    read_only(filename)