Exemple #1
0
 def _close_index_files(self):
     """Close all open index file handles and pass each path to read_only().

     The handles are self.fh_ids, self.fh_files and self.fh_dates, followed
     by every handle stored as a value in self.fh_processed.
     """
     handles = [self.fh_ids, self.fh_files, self.fh_dates]
     # extend() accepts any iterable, so this works whether values() returns
     # a list (Python 2) or a view object (Python 3); "list + view" would
     # raise a TypeError on the latter.
     handles.extend(self.fh_processed.values())
     for handle in handles:
         # Remember the path first; it is needed after the handle is closed.
         path = handle.name
         handle.close()
         read_only(path)
Exemple #2
0
 def _create_general_config_file(self):
     """Write the general info file (FNAME_INFO_GENERAL) into self.conf_path.

     The file contains self.command followed by the concatenation of
     self.settings. Once written, its path is handed to read_only().
     """
     filename = os.path.join(self.conf_path, FNAME_INFO_GENERAL)
     print("[--init] creating %s" % filename)
     # The with-block guarantees the content is flushed and the handle is
     # closed before read_only() is applied to the file.
     with open(filename, 'w') as fh:
         fh.write(self.command)
         fh.write("".join(self.settings))
     read_only(filename)
Exemple #3
0
 def create_skeleton_on_disk(self, repotype):
     """Lay out the directories and files of a fresh repository on disk.

     Creates the doc, log, idx, data and proc directories, writes repotype
     (plus a newline) to self.type_file, and creates one empty file for each
     path returned by self._index_files(). Every file created is passed to
     read_only() afterwards."""
     directories = (
         self.doc_dir,
         self.log_dir,
         self.idx_dir,
         self.data_dir,
         self.proc_dir,
     )
     for directory in directories:
         os.makedirs(directory)

     # type.txt is the only file that starts out with content.
     with open(self.type_file, 'w') as type_fh:
         type_fh.write("%s\n" % repotype)
     read_only(self.type_file)

     for index_path in self._index_files():
         # Opening for writing and closing straight away leaves an empty file.
         with open(index_path, 'w'):
             pass
         read_only(index_path)
Exemple #4
0
def add_info_file(corpus_dir, extra_files, added):
    """Append information to CORPUS/config/additions.txt.

    One record is appended per call. It holds the command line (sys.argv), a
    locale-formatted timestamp, the value of extra_files, the value of added
    and the result of get_git_commit().

    The info file is made writable for the duration of the update and is
    passed to read_only() again afterwards, even when writing fails.
    """
    info_file = os.path.join(corpus_dir, 'config', corpus.FNAME_INFO_ADDITIONS)
    # Build the whole record before touching the file so that a failure in
    # get_git_commit() cannot leave a half-written entry behind.
    record = "".join([
        "$ %s\n\n" % ' '.join(sys.argv),
        "timestamp    =  %s\n" % time.strftime("%x %X"),
        "file_list    =  %s\n" % extra_files,
        "files_added  =  %s\n" % added,
        "git_commit   =  %s\n\n\n" % get_git_commit(),
    ])
    make_writable(info_file)
    try:
        with open(info_file, 'a') as fh:
            fh.write(record)
    finally:
        # Restore the protection regardless of whether the write succeeded.
        read_only(info_file)
Exemple #5
0
 def _create_filelist(self):
     """Populate self.file_list from an existing list or a directory scan.

     When self.source_file is set it is copied verbatim. Otherwise the paths
     returned by get_file_paths(self.source_path) are written one per line,
     each prefixed with "0000" and a tab, after an optional shuffle. With
     neither source defined the program exits with an error message. The
     resulting file is passed to read_only()."""
     print("[--init] creating %s" % self.file_list)

     # Guard clause: without any input source there is nothing to build.
     if self.source_file is None and self.source_path is None:
         sys.exit("[--init] ERROR: "
                  "need to define input with --filelist or "
                  "--source-directory option, aborting")

     if self.source_file is not None:
         shutil.copyfile(self.source_file, self.file_list)
     else:
         paths = get_file_paths(self.source_path)
         if self.shuffle_file:
             random.shuffle(paths)
         with open(self.file_list, 'w') as out:
             out.writelines("0000\t" + path + "\n" for path in paths)

     read_only(self.file_list)
Exemple #6
0
def add_files_to_corpus(corpus_dir, extra_files):
    """Append lines in extra_files to files.txt in the corpus.

    A time-stamped backup of files.txt is created first. Each line of
    extra_files is expected to be tab-separated with the file name in the
    second column. Lines whose file name is already in files.txt, or was
    already added earlier in the same run, are skipped, and blank lines are
    ignored. The number of added lines is recorded via add_info_file().

    Exits with a warning message when corpus_dir is not a directory. A
    non-blank line without a second tab-separated column raises IndexError.
    """
    if not os.path.isdir(corpus_dir):
        # Raise SystemExit directly instead of relying on the exit() helper,
        # which is only injected by the site module.
        raise SystemExit("WARNING: there is no corpus at %s" % corpus_dir)
    fname_current = os.path.join(corpus_dir, 'config', corpus.FNAME_FILELIST)
    # Strip the last four characters (presumably the ".txt" extension) and
    # insert the timestamp before a fresh ".txt".
    fname_current_bak = "%s-%s.txt" % (fname_current[:-4],
                                       time.strftime("%Y%m%d:%H%M%S"))
    make_writable(fname_current)
    newly_added = set()
    try:
        shutil.copyfile(fname_current, fname_current_bak)
        current_files = read_files(fname_current)
        with open(extra_files) as fh_extra:
            with open(fname_current, 'a') as fh_current:
                for line in fh_extra:
                    if not line.strip():
                        continue
                    fname = line.strip().split("\t")[1]
                    if fname in current_files or fname in newly_added:
                        continue
                    newly_added.add(fname)
                    print("adding %s" % fname)
                    # A final line without a newline would otherwise be
                    # glued to whatever gets appended next.
                    if not line.endswith("\n"):
                        line += "\n"
                    fh_current.write(line)
    finally:
        # Re-protect the file list even if the update failed half-way.
        read_only(fname_current)
    add_info_file(corpus_dir, extra_files, len(newly_added))
Exemple #7
0
 def _close_log(self):
     """Close the log file handle and pass the log's path to read_only()."""
     log_fh = self.log
     # Capture the path up front; it is used after the handle is closed.
     log_path = log_fh.name
     log_fh.close()
     read_only(log_path)
Exemple #8
0
 def _create_default_pipeline_config_file(self):
     """Write the default pipeline configuration file into self.conf_path.

     The file (FNAME_PIPELINE_DEFAULT) receives self.pipeline_config with
     leading whitespace stripped. Once written, its path is handed to
     read_only().
     """
     filename = os.path.join(self.conf_path, FNAME_PIPELINE_DEFAULT)
     print("[--init] creating %s" % filename)
     # The with-block guarantees the content is flushed and the handle is
     # closed before read_only() is applied to the file.
     with open(filename, 'w') as fh:
         fh.write(self.pipeline_config.lstrip())
     read_only(filename)