Exemple #1
0
 def sync_tree_info(self, options=None):
     """Load the S3 tree metadata once per instance.

     When ``options['refresh_dest_meta']`` is truthy the metadata is fetched
     with ``get_tree_info`` and persisted with ``write_tree_info``; otherwise
     the locally persisted copy is loaded with ``read_tree_info`` (much
     faster). Once ``self._synced_tree_info`` is set, later calls return
     immediately.

     Args:
         options: optional dict of sync options. When None,
             ``self._default_sync_options`` is used. Only the
             'refresh_dest_meta' key is consulted here.
     """
     if self._synced_tree_info:
         logging.debug("skipping sync_tree_info, already done")
         return
     if options is None:
         options = self._default_sync_options
     # Must be bound even when the key is absent (or options is empty/None);
     # otherwise the branch below fails with UnboundLocalError. Defaulting to
     # False matches the "trust the persisted file" default of the FileDest
     # implementation of this method.
     do_full_refresh = False
     if options and 'refresh_dest_meta' in options:
         do_full_refresh = options['refresh_dest_meta']

     start = u.timestamp_now()
     if do_full_refresh:
         logging.info("starting s3 metadata sync")
         self.get_tree_info()
         self.write_tree_info()
         elapsed = u.timestamp_now() - start
         logging.info("S3.sync_tree_info finished in %f seconds" % elapsed)
     else:
         # read local S3 tree info (much faster)
         self.read_tree_info()
         elapsed = u.timestamp_now() - start
         logging.info(
             "S3 tree metadata loaded from local file in %f seconds" %
             elapsed)
     self._synced_tree_info = True
Exemple #2
0
def test_age_file():
    """Time repeated ``u.age_file`` calls on a single source file.

    Creates the scratch tree under /home/floeuser/test_age_file, writes a
    small source file, then calls ``u.age_file`` 509 times, printing how long
    each call took.
    """
    base_dir = '/home/floeuser/test_age_file'
    aged_dir = '/'.join((base_dir, 'aged'))
    u.ensure_path(aged_dir)
    src_file = '/'.join((base_dir, 'foo.bar'))
    with open(src_file, 'w') as out:
        out.write('foo.bar')
    for iteration in range(1, 510):
        began = u.timestamp_now()
        u.age_file(src_file, aged_dir)
        took = u.timestamp_now() - began
        print("iter %i took %f seconds" % (iteration, took))
Exemple #3
0
def file_dest_speed_test(dest_mgr, size):
    """Upload a zero-filled file of ``size`` bytes via ``dest_mgr`` and log the rate.

    Args:
        dest_mgr: object exposing ``upload_finfo(path, name, key)``.
        size: number of NUL bytes to write to the temporary file.

    The temporary file "tempfile.text" is created in the current working
    directory and is left in place after the upload.
    """
    name = "tempfile.text"
    with open(name, "wb") as fh:
        fh.write(b"\0" * size)

    start = u.timestamp_now()
    dest_mgr.upload_finfo(".", name, name)
    elapsed = u.timestamp_now() - start
    # A coarse clock (or a tiny upload) can report zero elapsed time; report
    # an infinite rate instead of crashing with ZeroDivisionError.
    rate = size / elapsed if elapsed > 0 else float('inf')
    logging.debug("FileDest object uploaded, key = '%s', %i bytes, %f sec, %f bytes/sec" %
                  (name, size, elapsed, rate))
Exemple #4
0
    def speed_test(self, size):
        """Upload a zero-filled file of ``size`` bytes with ``self.upload`` and log the rate.

        Args:
            size: number of NUL bytes to write to the temporary file.

        The temporary file "tempfile.text" is created in the current working
        directory and is left in place after the upload.
        """
        name = "tempfile.text"
        with open(name, "wb") as fh:
            # bytes literal: the file is opened in binary mode
            fh.write(b"\0" * size)

        start = u.timestamp_now()
        self.upload(".", name, name)
        elapsed = u.timestamp_now() - start
        # A coarse clock (or a tiny upload) can report zero elapsed time; report
        # an infinite rate instead of crashing with ZeroDivisionError.
        rate = size / elapsed if elapsed > 0 else float('inf')
        logging.debug(
            "S3 object uploaded, key = '%s', %i bytes, %f sec, %f bytes/sec" %
            (name, size, elapsed, rate))
Exemple #5
0
 def process(self, do_clear_info=True):
     """Run the multi-pass tree processing and report whether any work was done.

     Pass 0 walks ``self.config.input``; each following pass walks
     ``self.config.output`` and the loop stops as soon as a pass processes no
     files, a signal is seen, or ``self.PASSES`` passes have run.

     Args:
         do_clear_info: when true, ``self.file_info`` is cleared before
             walking, and (if ``self.clear_first`` is also set) the output
             folder is cleared as well.

     Returns:
         False if a signal was seen (during pass 0 this returns immediately,
         before ``update_input_mgr_metadata``); otherwise True when any pass
         processed at least one file.

     Raises:
         Exception: when the pass counter has reached ``self.PASSES`` after
             the loop.
     """
     logging.info("starting tree processing")
     start = u.timestamp_now()
     u.ensure_path(self.config.output)
     self._root_files_processed = 0
     if self.clear_first and do_clear_info:
         u.clear_folder(self.config.output)
     # do this at start in case last run didn't clean up properly
     self.remove_unpacked_files()
     if do_clear_info:
         self.file_info.clear()
     self._pass = 0  # pass number
     self._files_processed = 0
     # make one pass over the input files. if you need to know whether this is
     # the input pass, check for self._pass == 0.
     self._walk_files(self.config.input)
     if self.config.signalled():
         logging.info("signal set, leaving tp.process")
         return False
     # then make passes over the output files until no new files are encountered
     work_done = self._files_processed > 0
     Config.log('tp._files_processed = %i' % self._files_processed, tag='WORK_DONE_PASS_0')
     # do NOT look at _root_files_processed after pass 0 - we want to fully
     # process any files created during pass 0
     while self._pass < self.PASSES:
         # the counter is reset per pass so "no files this pass" can end the loop
         self._files_processed = 0
         self._pass += 1
         self._walk_files(self.config.output)
         if self.config.signalled():
             logging.info("signal set, leaving tp.process after pass %i" % self._pass)
             # a signalled run always reports "no work done", even if earlier passes did work
             work_done = False
             break
         Config.log('tp._files_processed = %i' % self._files_processed, tag='WORK_DONE_PASS_%i' % self._pass)
         if self._files_processed > 0:
             work_done = True
         else:
             break
     # NOTE(review): this check only looks at the pass counter, so it also fires
     # when the last allowed pass processed no files (i.e. converged) or was
     # interrupted by a signal - confirm that failing in those cases is intended.
     if self._pass >= self.PASSES:
         raise Exception("completed %i passes and still not done. failing" % self.PASSES)
     self.update_input_mgr_metadata()
     elapsed = u.timestamp_now() - start
     Config.log("tp completed in %i passes, %f seconds, work_done %s" % (self._pass, elapsed, work_done), tag='WORK_DONE')
     return work_done
Exemple #6
0
    def update_products(self):
        """Download new or changed files from the FTP folder into ``self._dest_root``.

        A connection is always opened first. Three special modes are then
        checked in order; each returns 0 without downloading anything:

        * ``ftp_catchup``: record metadata for every remote file and persist
          it, so the next normal run finds no work unless the persisted
          listing is edited. This allows forcing only specific files to be
          downloaded.
        * ``ftp_remove`` / ``im_rerun``: re-read the persisted metadata,
          passing ``special_mode_args[0]`` as the action for entries whose
          full path matches the regex in ``special_mode_args[1]``, then
          persist it.
        * ``im_meta_from_local``: rebuild metadata from the input directory.

        In normal operation every remote file that passes the include and
        exclude patterns is downloaded when it is unknown locally or its
        remote modification time is newer than the recorded one. Downloading
        stops early on a signal or when ``self._download_limit`` (if > 0) is
        reached.

        The FTP connection is closed and ``self._ftp`` reset to None on every
        exit path, including the special modes and exceptions.

        Returns:
            The number of files downloaded (0 in the special modes).
        """
        logging.info("starting FTP update from %s" % self._label)
        start = u.timestamp_now()
        self._download_count = 0
        self._ftp = ftplib.FTP(self._site, self._login, self._password)
        try:
            self._ftp.cwd(self._folder)

            # catchup mode: collect names and metadata, but don't copy any files
            if self.config.do('ftp_catchup'):
                cur_ftp_files = []
                self._local_files.clear()
                self._ftp.retrlines('LIST', cur_ftp_files.append)
                for entry in cur_ftp_files:
                    finfo = u.ftp_metadata(entry)
                    if finfo['isdir']:
                        continue
                    finfo['modified'] = self.ftp_modified(finfo['name'])
                    finfo['path'] = self._dest_root
                    finfo['full'] = self._dest_root + '/' + finfo['name']
                    self._local_files[finfo['name']] = finfo
                self.write_metadata()
                return 0

            # remove or modify persisted file metadata if full path matches passed regex
            if self.config.do('ftp_remove') or self.config.do('im_rerun'):
                regex = re.compile(self.config.special_mode_args[1])

                def test(fi):
                    return regex.search(fi['full'])

                self.read_metadata(test=test,
                                   action=self.config.special_mode_args[0])
                self.write_metadata()
                return 0

            # build metadata file from what's in input dir
            if self.config.do('im_meta_from_local'):
                self.metadata_from_local(clear_first=True)
                return 0

            # normal operation:
            self.read_metadata()
            self.metadata_from_local(clear_first=False)
            cur_ftp_files = []
            self._ftp.retrlines('LIST', cur_ftp_files.append)
            for entry in cur_ftp_files:
                if self.config.signalled():
                    logging.info("signal set, leaving ftp.update_products")
                    break
                finfo = u.ftp_metadata(entry)
                file_name = finfo['name']
                if finfo['isdir']:
                    continue
                # test include/exclude rules
                if not any(re.search(reg, file_name) for reg in self._re_include):
                    Config.log("skipping file '%s'" % file_name,
                               tag='FTP_INCLUDE_FILES')
                    continue
                if any(re.search(reg, file_name) for reg in self._re_exclude):
                    Config.log("skipping file '%s'" % file_name,
                               tag='FTP_EXCLUDE_FILES')
                    continue
                finfo['modified'] = self.ftp_modified(file_name)
                local_full = self._dest_root + '/' + file_name
                grabit = True
                if file_name in self._local_files:
                    # download if ftp version is newer than our last download
                    localmod = self._local_files[file_name]['modified']
                    grabit = finfo['modified'] > localmod
                if not grabit:
                    continue
                logging.info("grabbing new/changed file %s" % file_name)
                # 'with' guarantees the handle is closed even if the transfer fails
                with open(local_full, "wb") as fh:
                    self._ftp.retrbinary('RETR ' + file_name, fh.write)
                finfo['path'] = self._dest_root
                finfo['full'] = local_full
                self._local_files[file_name] = finfo
                self._download_count += 1
                msg = "FTP downloaded file %i of limit %i" % (
                    self._download_count, self._download_limit)
                logging.info(msg)
                if self._download_limit > 0 and self._download_count >= self._download_limit:
                    msg = "downloaded limit of %i files, ending download phase" % self._download_count
                    self.config.add_to_final_summary(msg)
                    logging.warning(msg)
                    break
        finally:
            # Release the control connection on every exit path (special-mode
            # returns and errors included). quit() is the polite shutdown; if
            # the server or socket is already unusable, fall back to close().
            ftp, self._ftp = self._ftp, None
            if ftp is not None:
                try:
                    ftp.quit()
                except ftplib.all_errors:
                    ftp.close()
        self.write_metadata()
        elapsed = u.timestamp_now() - start
        logging.info(
            "FTP.update_products finished in %f seconds, downloaded %i files" %
            (elapsed, self._download_count))
        return self._download_count
Exemple #7
0
    def sync_to_upstream_dest_mgr(self, upstream, refresh_me, refresh_upstream, tmp_folder, fixer=None):
        """Bring this dest mgr's files and tree info in line with ``upstream``.

        Both managers first sync their tree info; ``refresh_me`` and
        ``refresh_upstream`` select the sync options ('full', 'smart', or
        anything else for no refresh). Then, for every upstream entry whose
        file exists on disk:

        * template-type files (when ``fixer`` is given) are copied into
          ``tmp_folder``, passed to ``fixer``, re-measured, and copied here;
        * all other files are copied only when missing here or when md5,
          modification time or size indicate a difference.

        Finally, entries present here but absent upstream are deleted from
        disk and from the tree info, and the tree info is persisted.

        Args:
            upstream: dest mgr to copy from.
            refresh_me: refresh mode for this manager.
            refresh_upstream: refresh mode for ``upstream``.
            tmp_folder: scratch folder; cleared at the start, not at the end.
            fixer: optional callable taking the path of a copied template file.
        """
        Config.log(
            "refresh_me = %s, refresh_upstream = %s, upstream root = '%s', my root = '%s', tmp_folder = %s" %
            (refresh_me, refresh_upstream, upstream._file_dest_root, self._file_dest_root, tmp_folder),
            tag='FILE_DEST_SYNC_TO_UPSTREAM_STARTING')

        full_opts = {'refresh_dest_meta': True}
        trust_opts = {'refresh_dest_meta': False}
        smart_opts = {'refresh_dest_meta': True, 'skip_refresh_if_tree_unchanged': True}

        def options_for(mode):
            # map a refresh mode onto the matching sync options
            if mode == 'full':
                return full_opts
            if mode == 'smart':
                return smart_opts
            return trust_opts

        self.sync_tree_info(options=options_for(refresh_me))
        upstream.sync_tree_info(options=options_for(refresh_upstream))

        u.clear_folder(tmp_folder)
        start = u.timestamp_now()
        for key, finfo in upstream.tree_info_items():
            upstream_file = os.path.join(upstream._file_dest_root, key)
            if not os.path.exists(upstream_file):
                Config.log("file '%s' does not exist" % upstream_file,
                           tag='FILE_DEST_SYNC_TO_UPSTREAM_METADATA ERROR')
                continue

            if self.config.is_template_type(finfo['name']) and fixer:
                # stage a copy in tmp_folder and let the fixer rewrite it
                staged = os.path.join(tmp_folder, key)
                u.ensure_path_for_file(staged)
                shutil.copyfile(upstream_file, staged)
                fixer(staged)
                # the fixed file differs from upstream, so re-measure it
                fixed_info = copy.deepcopy(finfo)
                staged_dir, staged_name = os.path.split(staged)
                staged_meta = u.local_metadata(staged_dir, staged_name)
                fixed_info['size'] = staged_meta['size']
                fixed_info['modified'] = staged_meta['modified']
                fixed_info['md5'] = u.md5(staged)
                self.tree_info[key] = fixed_info
                target = os.path.join(self._file_dest_root, key)
                u.ensure_path_for_file(target)
                shutil.copyfile(staged, target)
                # the staged file stays in tmp_folder; it is not removed here
                continue

            # file not subject to fixup: copy only if missing/older/different
            if key in self.tree_info:
                mine = self.tree_info[key]
                needs_copy = (mine['md5'] != finfo['md5'] or
                              mine['modified'] < finfo['modified'] or
                              mine['size'] != finfo['size'])
            else:
                needs_copy = True
            if not needs_copy:
                continue
            # REVIEW - deepcopy probably safe here because we're copying from
            # one dest mgr to another
            self.tree_info[key] = copy.deepcopy(finfo)
            target = os.path.join(self._file_dest_root, key)
            u.ensure_path_for_file(target)
            shutil.copyfile(upstream_file, target)

        # collect first, delete afterwards, so tree_info is not mutated while iterating
        stale = {
            key: os.path.join(self._file_dest_root, finfo['key'])
            for key, finfo in self.tree_info_items()
            if key not in upstream.tree_info
        }
        for key, full_path in stale.items():
            os.remove(full_path)
            del self.tree_info[key]

        self.write_tree_info()

        # tmp_folder is deliberately left populated: it should be small, and the
        # fixed-up files can be handy for debugging. could be a config option.

        elapsed = u.timestamp_now() - start
        Config.log("done, elapsed %f seconds" % elapsed,
                   tag='FILE_DEST_SYNC_TO_UPSTREAM_FINISHED')
Exemple #8
0
    def sync_tree_info(self, options=None):
        """Load persisted tree info and optionally reconcile it with the files on disk.

        The persisted metadata is always read first. A full refresh then walks
        ``self._file_dest_root``, recomputes each file's md5, replaces entries
        that are new or whose md5 differs, drops entries with no matching
        file, and persists the result. Without a full refresh the persisted
        data is trusted as-is. Runs at most once per instance
        (``self._synced_tree_info``).

        Args:
            options: optional dict; when None, ``self._default_sync_options``
                is used. Recognised keys:
                'refresh_dest_meta' - its value requests a full refresh;
                'skip_refresh_if_tree_unchanged' - when the key is present
                (its value is not inspected), the decision is replaced by
                "refresh only if the tree's last-modified time differs from
                ``self._tree_last_modified``".
        """
        if self._synced_tree_info:
            Config.log(self.config_section, tag='FILE_DEST_META_ALREADY_SYNCED')
            return
        start = u.timestamp_now()
        logging.info("starting FileDest metadata sync")
        if options is None:
            options = self._default_sync_options
        # need to read persisted file, as that's the only place non-file-system metadata can live
        self.read_tree_info()
        # determine whether to force full refresh, only do it if tree has changed, or simply trust metadata:
        do_full_refresh = False
        if 'refresh_dest_meta' in options:
            do_full_refresh = options['refresh_dest_meta']
        # computing all those md5s takes a long time, so optionally skip it if the most
        # recent modification time for _file_dest_root is unchanged since we last did it
        if 'skip_refresh_if_tree_unchanged' in options:
            last_mod = u.dir_last_modified(self._file_dest_root)
            expected_last_mod = self._tree_last_modified
            # this overrides whatever 'refresh_dest_meta' asked for above
            do_full_refresh = last_mod != expected_last_mod
            # NOTE(review): '%f' requires both values to be numbers - confirm
            # _tree_last_modified is never None here. It is not updated in this
            # method; presumably read_tree_info/write_tree_info maintain it - verify.
            msg = "last_mod do_full_refresh = '%s', last_mod = '%f', expected_last_mod = '%f'" % (
                do_full_refresh, last_mod, expected_last_mod)
            Config.log(msg, tag='FILE_DEST_META_TREE_UNCHANGED_TEST')

        if do_full_refresh:
            # physically walk the tree as it might not match persisted data
            for dir_name, subdirs, files in os.walk(self._file_dest_root):
                for file_name in files:
                    full_src = dir_name + '/' + file_name
                    setit = False
                    rel_path = u.make_rel_path(self._file_dest_root, dir_name, strict=False, no_leading_slash=True)
                    key = u.make_key(rel_path, file_name)
                    local_meta = u.local_metadata(dir_name, file_name)
                    local_meta['md5'] = u.md5(full_src)
                    if key in self.tree_info:
                        saved_meta = self.tree_info[key]
                        if 'md5' not in saved_meta:
                            # placeholder guarantees a mismatch below, so the entry gets replaced
                            saved_meta['md5'] = 'ERROR! md5 MISSING FROM tree_info!'
                        if local_meta['md5'] == saved_meta['md5']:
                            # sanity check
                            if local_meta['size'] != saved_meta['size']:
                                msg = "key '%s', saved: size %i, read: size %i" % (
                                    key, saved_meta['size'], local_meta['size'])
                                Config.log(msg, tag='FILE_DEST_META_ERROR_NONFATAL')
                            # otherwise file is perfect, continue
                        else:
                            msg = "key '%s', md5 mismatch. saved: '%s', read: '%s'" % (
                                    key, saved_meta['md5'], local_meta['md5'])
                            Config.log(msg, tag='FILE_DEST_META_ERROR_FATAL')
                            setit = True
                    else:
                        msg = "key '%s' not found in saved, adding" % key
                        Config.log(msg, tag='FILE_DEST_META_NEW_FILE')
                        setit = True

                    if setit:
                        local_meta['key'] = key
                        # NOTE(review): this first assignment looks redundant with the
                        # one after the deletes below.
                        self.tree_info[key] = local_meta
                        # important: must never expose 'full' outside this class - it's a private
                        # implementation detail. Same for 'path'. Only 'key' is public
                        del local_meta['full']
                        del local_meta['path']
                        self.tree_info[key] = local_meta
                    # transient marker: entry has a file on disk (removed again below)
                    self.tree_info[key]['_found_file_'] = True

            # entries without the marker have no file on disk: collect them first,
            # then delete, so the mapping is not resized while being iterated
            missing = []
            for key in self.tree_info:
                if '_found_file_' in self.tree_info[key]:
                    del self.tree_info[key]['_found_file_']
                else:
                    missing.append(key)
                    msg = "no file matching key '%s', deleting" % key
                    Config.log(msg, tag='FILE_DEST_META_MISSING')
            for key in missing:
                del self.tree_info[key]

            self.write_tree_info()
            act = "completed"
        else:
            # trust the persisted file (faster)
            act = "bypassed"
        elapsed = u.timestamp_now() - start
        msg = "%s confirmation of tree info in %f seconds" % (act, elapsed)
        Config.log(msg, tag='FILE_DEST_SYNC')
        self._synced_tree_info = True
Exemple #9
0
    def upload_tree(self, local_root, remote_root='', options=None, local_tree_meta=None):
        """Upload every file under ``local_root`` that is new or changed on the dest.

        Tree info is synced first (with ``self._default_sync_options``). Each
        file outside relative paths starting with '/tmp' is then uploaded
        unless, with the 'use_md5' option present, its md5 matches the
        recorded one for a non-pending key, or its size exceeds
        ``self._max_upload_size`` (when that limit is >= 0). Metadata for each
        uploaded file is stored in ``self.tree_info`` and persisted at the end.

        Args:
            local_root: directory to walk.
            remote_root: unused by this implementation.
            options: optional dict; defaults to ``{'use_md5': True}`` when
                empty or None. Only the presence of 'use_md5' is checked.
            local_tree_meta: optional mapping of local file path to
                precomputed metadata; used instead of ``u.local_metadata``
                and as the source for ``transfer_metadata``.

        Returns:
            The number of files uploaded.
        """
        logging.info("starting FileDest upload")
        if not options:
            options = {'use_md5': True}
        start = u.timestamp_now()
        self._upload_count = 0
        # refresh and save data for files already on dest
        self.sync_tree_info(options=self._default_sync_options)

        for dir_name, _subdirs, files in os.walk(local_root):
            rel_path = u.make_rel_path(local_root, dir_name)
            if rel_path.startswith('/tmp'):
                continue
            for file_name in files:
                local_file = dir_name + '/' + file_name
                key = u.make_key(rel_path, file_name)
                local_md5 = u.md5(local_file)
                if local_tree_meta and local_file in local_tree_meta:
                    local_meta = local_tree_meta[local_file]
                else:
                    local_meta = u.local_metadata(dir_name, file_name)
                size = local_meta['size']
                cached_info = None
                if key in self.tree_info:
                    cached_info = self.tree_info[key]
                do_upload = True
                if 'use_md5' in options:
                    if cached_info and not self.is_pending(key):
                        if 'md5' in cached_info:
                            remote_md5 = cached_info['md5']
                            do_upload = do_upload and remote_md5 != local_md5
                        else:
                            err = "no md5 value for existing key '%s' (old version?)" % key
                            logging.error(err)
                    else:
                        Config.log("file '%s' is not in FileDest" % key, tag='DEST_NO_EXISTING_FILE')
                    # NOTE(review): the size limit is only enforced when 'use_md5'
                    # is in options - confirm this nesting is intended.
                    if self._max_upload_size >= 0 and size > self._max_upload_size:
                        logging.debug("file '%s' size (%i) > limit (%i), won't upload" %
                                      (key, size, self._max_upload_size))
                        do_upload = False
                if not do_upload:
                    Config.log("key = '%s'" % key, tag='FILE_DEST_UPLOAD_NO_CHANGE')
                    continue

                extra_args = {
                    'Metadata': {'md5': local_md5}
                }
                logging.debug("FileDest object upload starting, key = '%s', %i bytes" %
                              (key, size))
                # Separate timer per file: reusing 'start' here would make the
                # total reported at the end cover only the last upload.
                upload_start = u.timestamp_now()
                self._upload(dir_name, file_name, key, extra_args=extra_args)
                upload_elapsed = u.timestamp_now() - upload_start
                # guard against a zero reading from a coarse clock
                rate = size / upload_elapsed if upload_elapsed > 0 else float('inf')
                Config.log("key = '%s', %f bytes/sec" % (key, rate), tag='FILE_DEST_UPLOAD_OK')
                # add metadata to our repos
                info = {
                    'new': True,
                    'name': file_name,
                    'rel_path': rel_path,
                    'key': key,
                    'size': local_meta['size'],
                    'modified': local_meta['modified'],
                    'md5': local_md5
                }
                # transfer meta (e.g. thumbnail info) if exists
                if local_tree_meta and local_file in local_tree_meta:
                    # NOTE(review): passes self.local_root rather than the
                    # local_root argument - confirm which one is meant.
                    self.transfer_metadata(local_tree_meta[local_file], local_root=self.local_root, dest=info)

                self.tree_info[key] = info
                self._upload_count += 1
        self.write_tree_info()
        elapsed = u.timestamp_now() - start
        logging.info("FileDest.upload_tree finished in %f seconds, uploaded %i files" %
                     (elapsed, self._upload_count))
        return self._upload_count
from util import Timer, timestamp_now

# Corpus location handed to CombinedAnalyzer. For a quick single-file run,
# substitute a path such as '../treebank_data/00/ann_0001.parse'.
INPUT_PATH =  TREEBANK_DATA_PATH
# Base directory for reports. Must be a directory: each run writes its files
# into a timestamp-named subdirectory, with auto-generated file names.
OUTPUT_PATH = '../reports/'

def timestamped_file_path(filename, timestamp):
    """Return the normalized path OUTPUT_PATH/<timestamp>/<filename>."""
    components = (OUTPUT_PATH, timestamp, filename)
    return normpath(join(*components))
    
###############################################################################
if __name__ == '__main__':
    timer = Timer()
    nowstamp = timestamp_now()

    # every run writes into its own timestamp-named directory under OUTPUT_PATH
    report_dir = normpath(join(OUTPUT_PATH, nowstamp))
    mkdir(report_dir)

    csvpath = timestamped_file_path('verbs.csv', nowstamp)
    pd_report_path = timestamped_file_path('pro-drop report.txt', nowstamp)
    npd_report_path = timestamped_file_path('non-pro-drop report.txt', nowstamp)

    # analysis and report writing both happen inside the timer context
    with timer:
        ca = CombinedAnalyzer(INPUT_PATH)
        ca.do_analysis()
        ca.print_report_basic()

        with open(pd_report_path, 'w', encoding='utf8') as pdout, \
                open(npd_report_path, 'w', encoding='utf8') as npdout:
            ca.write_report_full(pdout, npdout)