def _read_pofile(filename):
    """Read a pickled polib.POFile from filename and return it.

    Returns None if the file is missing or unreadable.
    """
    log.v2('Reading from %s', filename)
    try:
        with open(filename) as f:
            return cPickle.load(f)
    except (IOError, OSError):
        return None
def _download_from_s3(gitbigfile_module, outfile_abspath, sha):
    """Fetch the blob named by `sha` from S3 (via git-bigfile) to outfile_abspath."""
    s3_fetcher = gitbigfile_module.GitBigfile().transport()
    log.v2('Downloading s3://%s/%s to %s' % (
        s3_fetcher.bucket.name, sha, outfile_abspath + '.tmp'))
    s3_fetcher.get(sha, outfile_abspath + '.tmp')
    # Make sure we don't create the 'real' file until it's fully
    # downloaded.
    try:
        os.unlink(outfile_abspath)
    except (IOError, OSError):
        pass          # probably "file not found"
    try:
        os.rename(outfile_abspath + '.tmp', outfile_abspath)
    except OSError:
        log.v1('Error fetching %s' % outfile_abspath)
        raise
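# Illustrative usage sketch (not part of the build).  build_many() below
# fans _download_from_s3 out across threads; a single direct call looks
# roughly like this.  Assumptions: the path and sha below are made up, and
# gitbigfile is already importable (build_many() munges sys.path first).
def _example_fetch_one_blob():
    import gitbigfile.command
    _download_from_s3(
        gitbigfile.command,        # module object exposing GitBigfile()
        '/abs/path/to/intl/translations/fr/index.tmpdownload',   # hypothetical
        'abc123def4567890abc123def4567890abc123de')               # hypothetical sha1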
def _write_pofile(po_entries, filename, write_debug_file_to=None):
    """Write a polib.POFile to filename.

    The po-file format is nicely human-readable, but slow to parse.
    The mo-file format is faster to parse, but loses important
    information.  So we introduce a *third* format: pickled
    polib.POFile.  Whenever we save a pofile to disk, we save a
    pickled form of the python data structure (polib.POFile).

    We also normalize the po-entries before writing the file, to
    minimize diffs.

    Arguments:
        po_entries: a list of POEntry objects.
        filename: an absolute path to write the pofile to.
        write_debug_file_to: if not None, a filename to write the
            po_entries to as a (human-readable) po-file, rather than
            a po.pickle file.
    """
    from intl import polib_util

    output_pot = polib_util.pofile()
    output_pot.extend(po_entries)

    # Sort the po-entries in a canonical order, to make diff-ing
    # easier, while trying to keep content close together in the
    # file if it's close together in real life.  We sort by first
    # occurrence (alphabetically), which is good for most content,
    # but not for datastore entities, which all have the same
    # occurrence (_DATASTORE_FILE:1).  For them, we sort by the
    # first url-they-appear-in.  For entries that match on all of
    # these things, we depend on the fact that python's sorts are
    # stable to keep them in input order (that is, the order in
    # which we extracted them from the input file).
    url_re = re.compile('<http[^>]*>')
    output_pot.sort(key=lambda e: (e.occurrences[0][0],
                                   int(e.occurrences[0][1]),
                                   sorted(url_re.findall(e.comment))[:1]))

    log.v2('Writing to %s', filename)
    with open(filename, 'w') as f:
        cPickle.dump(output_pot, f, protocol=cPickle.HIGHEST_PROTOCOL)

    if write_debug_file_to:
        log.v2('Also writing to %s', write_debug_file_to)
        with open(write_debug_file_to, 'w') as f:
            polib_util.write_pofile(output_pot, f)

    log.v3('Done!')
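# Illustrative round-trip sketch (not part of the build): _write_pofile()
# pickles a polib.POFile and _read_pofile() above loads it back.  The
# entries, paths, and occurrences below are made up; it assumes polib and
# intl.polib_util are importable in this environment.
def _example_pofile_round_trip():
    import polib
    entries = [
        polib.POEntry(msgid='Hello', msgstr='',
                      occurrences=[('javascript/foo.js', '12')]),
        polib.POEntry(msgid='Goodbye', msgstr='',
                      occurrences=[('javascript/bar.js', '3')]),
    ]
    _write_pofile(entries, '/tmp/example.pot.pickle',
                  write_debug_file_to='/tmp/example.pot.txt_for_debugging')
    reloaded = _read_pofile('/tmp/example.pot.pickle')
    # Entries come back sorted by first occurrence: bar.js before foo.js.
    assert [e.msgid for e in reloaded] == ['Goodbye', 'Hello']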
class DownloadIndex(compile_rule.CompileBase):
    def __init__(self):
        super(DownloadIndex, self).__init__()
        self._locale_paths = None

    def version(self):
        """Update every time build() changes in a way that affects output."""
        import datetime
        # Force redownloading once a month.
        return datetime.datetime.now().strftime("%Y-%m")

    def build(self, outfile_name, infile_names, changed, context):
        """Download .index and .chunk files from prod.

        CompilePOFile takes a long time to compute.  So when not on
        jenkins we call this rule instead to fetch from prod what is
        there.
        """
        if self._locale_paths is None:
            self._init_locale_paths()

        log.v2("Determining latest prod translation files for %s"
               % context['{lang}'])
        locale = context['{lang}']
        locale_path = 'gs://ka_translations/%s/' % locale
        if locale_path not in self._locale_paths:
            raise NoSuchLocaleCompileFailure(locale)

        try:
            stdout = self.call_with_output(['gsutil', 'ls', locale_path])
        except compile_rule.CompileFailure as e:
            # TODO(james): make sure we download gcloud and gsutil as part
            # of the khan-dotfiles setup.
            raise compile_rule.CompileFailure(
                "%s.\nFailed to download translations from gcs. Make sure "
                "that you have gsutil installed via gcloud." % e)

        dirs = stdout.split()
        if dirs:
            most_recent_dir = dirs[-1]
            log.v2("Downloading latest prod files from %s" % most_recent_dir)
            self.call(
                ['gsutil', '-m', 'cp', '-r', "%s*" % most_recent_dir,
                 os.path.dirname(outfile_name)])
            return

        # No translation files found on gcs ... let's complain.
        raise compile_rule.CompileFailure(
            "Failed to find translation files for %s on gcs"
            % context['{lang}'])
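# Illustrative sketch (not part of the build) of why build() above can treat
# the last entry of `gsutil ls` output as the most recent upload: GCS lists
# objects lexicographically, so this assumes each prod upload lives in a
# subdirectory whose name sorts chronologically (the names below are made up).
def _example_pick_most_recent_dir():
    stdout = ('gs://ka_translations/fr/2015-10-01-0330/\n'
              'gs://ka_translations/fr/2015-11-15-0330/\n'
              'gs://ka_translations/fr/2015-12-03-0330/\n')
    dirs = stdout.split()
    return dirs[-1]      # lexicographically (and so chronologically) last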
def build(self, outfile_name, infile_names, changed, context):
    # The infiles here are genfiles/extracted_strings/foo.pot.pickle
    # Copy unchanged messages from the existing all.pot, if possible.
    po_entries = collections.OrderedDict()
    if outfile_name in changed or changed == infile_names:
        log.v1('Regenerating %s from scratch (it changed on us!)'
               % outfile_name)
        changed = infile_names       # everything changed
    else:
        # Extract unchanged messages from the existing all.pot
        existing_all_pot = _read_pofile(self.abspath(outfile_name))
        if existing_all_pot:         # we found an existing file
            log.v2('Loading existing messages')
            # We don't care about deleted files: those that
            # existed in the last call to build() but don't exist
            # now.  (They'll be removed from all.pot by default.)
            # Get rid of them from 'changed' so they don't gum up
            # the code below.
            changed = [f for f in changed if f in infile_names]
            # Elements in infile_names and changed look like
            # 'genfiles/extracted_strings/en/foo.pot.pickle'.  Here,
            # we want the versions of infiles/changed that are just
            # 'foo'.  We use the _input_map to get that mapping.
            orig_infiles = set(context['_input_map'][f][0]
                               for f in infile_names)
            # f might not be in _input_map if it's been deleted.
            orig_changed = set(context['_input_map'][f][0]
                               for f in changed)
            unchanged = orig_infiles - orig_changed
            for entry in existing_all_pot:
                # Get rid of occurrences for files that no longer exist.
                # TODO(csilvers): get rid of comments in the same way.
                entry.occurrences = [occ for occ in entry.occurrences
                                     if occ[0] in unchanged]
                # If the msgid still exists at all, let's keep it!
                if entry.occurrences:
                    po_entries[entry.msgid] = entry
        else:
            changed = infile_names

    log.v2('Extracting new and changed messages')
    for filename in changed:
        input_pot = _read_pofile(self.abspath(filename))
        for poentry in input_pot:
            if poentry.msgid in po_entries:
                existing_poentry = po_entries[poentry.msgid]
                _merge_poentry(existing_poentry, poentry)
            else:
                po_entries[poentry.msgid] = poentry

    log.v2('Writing merged output')
    _write_pofile(po_entries.itervalues(),
                  self.abspath(outfile_name),
                  write_debug_file_to=self.abspath(
                      outfile_name.replace('.pickle', '.txt_for_debugging')))
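# Illustrative sketch (not part of the build) of the mapping build() above
# relies on.  The exact shape of context['_input_map'] is an assumption, and
# the paths are made up: the idea is that each generated .pot.pickle maps
# back to the source it was extracted from, which is what lets us compute
# the set of unchanged sources.
def _example_unchanged_sources():
    input_map = {
        'genfiles/extracted_strings/en/foo.pot.pickle': ['foo'],
        'genfiles/extracted_strings/en/bar.pot.pickle': ['bar'],
    }
    infile_names = list(input_map)
    changed = ['genfiles/extracted_strings/en/bar.pot.pickle']
    orig_infiles = set(input_map[f][0] for f in infile_names)
    orig_changed = set(input_map[f][0] for f in changed)
    return orig_infiles - orig_changed      # -> set(['foo'])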
def _update_image_url_info(css_filename, image_url_info):
    """Given a css_filename relative to ka-root, update _IMAGE_URL_INFO.

    Returns:
        A list of image filenames, relative to ka-root, mentioned in
        this css-filename.
    """
    # First, we need to delete all old references to css_filename.
    for file_info in image_url_info.itervalues():
        new_files = [f for f in file_info[0] if f != css_filename]
        if len(new_files) < len(file_info[0]):
            # We go through this contortion so we can edit the list in place.
            del file_info[0][:]
            file_info[0].extend(new_files)

    # If the file no longer exists (has been deleted), we're done!
    if not os.path.exists(ka_root.join(css_filename)):
        log.v3("removing image-url info for %s: it's been deleted",
               css_filename)
        return

    # Then, we need to add updated references, based on the current
    # file contents.
    log.v2('Parsing image-urls from %s', css_filename)
    with open(ka_root.join(css_filename)) as f:
        content = f.read()

    retval = []
    for (img_url, img_relpath, img_size) in (
            _image_urls_and_file_info(content)):
        image_url_info.setdefault(img_url, ([], img_relpath, img_size))
        image_url_info[img_url][0].append(css_filename)
        retval.append(img_relpath)

    log.v4('Image-url info: %s', retval)
    return retval
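# Illustrative sketch (not part of the build) of the structure that
# _update_image_url_info() maintains.  The entry below is made up: each
# image url maps to a 3-tuple of (list of css files that reference it,
# image path relative to ka-root, size as reported by
# _image_urls_and_file_info() -- bytes is an assumption).  The css-file
# list is kept mutable so it can be edited in place above.
_EXAMPLE_IMAGE_URL_INFO = {
    '/images/icons/star.png': (
        ['stylesheets/exercises-package/stars.css'],   # hypothetical css file
        'images/icons/star.png',                       # relpath from ka-root
        1843,                                          # size (assumed bytes)
    ),
}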
def build_many(self, outfile_infiles_changed_context):
    from shared.testutil import fake_datetime

    sha_to_files = {}        # for the files we need to get from S3
    for (outfile, infiles, _, context) in outfile_infiles_changed_context:
        assert len(infiles) == 1, infiles
        assert infiles[0].startswith('intl/translations/')
        with open(self.abspath(infiles[0])) as f:
            head = f.read(64).strip()
        # Does the head look like a sha1?  (sha1's are only 40 bytes.)
        # If so, store it for later.  If not, take care of it now.
        if head.strip('0123456789abcdefABCDEF') == '':
            sha_to_files.setdefault(head, []).append(outfile)
        else:
            # Nope, not a sha1.  NOTE: We could also use a hard-link,
            # but that could fail if genfiles is on a different
            # filesystem from the source.  Copying is more expensive
            # but safer.  Symlinks are right out.
            shutil.copyfile(self.abspath(infiles[0]), self.abspath(outfile))

    if not sha_to_files:
        return

    # We could just call 'git bigfile pull' but we purposefully
    # don't so as to leave untouched the file-contents in
    # intl/translations.  This works better with kake, which
    # doesn't like it when input contents change as part of a kake
    # rule.
    self._munge_sys_path()     # so the following import succeeds
    import gitbigfile.command

    # Download all our files from S3 in parallel.  We store these
    # files under a 'permanent' name based on the sha1.  (Later
    # we'll copy these files to outfile_name.)  That way even if
    # you check out a different branch and come back to this one
    # again, you can get the old contents without needing to
    # revisit S3.
    # GitBigfile() (in _download_from_s3) runs 'git' commands in a
    # subprocess, so we need to be in the right repository for that.
    old_cwd = os.getcwd()
    os.chdir(self.abspath('intl/translations'))
    try:
        # This will actually try to download translation files via
        # bigfile.  This requires a real datetime for making the
        # api requests to S3 (S3 complains about weird dates).
        with fake_datetime.suspend_fake_datetime():
            arglists = []
            for (sha, outfiles) in sha_to_files.iteritems():
                # Typically a given sha will have only one outfile,
                # but for some shas (an empty po-file, e.g.), many
                # outfiles may share the same sha!
                log.v1('Fetching %s from S3' % ' '.join(outfiles))
                # We just need to put this in a directory we know we
                # can write to: take one of the outfile dirs arbitrarily.
                sha_name = os.path.join(os.path.dirname(outfiles[0]), sha)
                arglists.append(
                    (gitbigfile.command, self.abspath(sha_name), sha))
            shared.util.thread.run_many_threads(
                self._download_from_s3, arglists)
    except RuntimeError as why:
        log.error(why)     # probably misleading, but maybe helpful
        # TODO(csilvers): check whether git-bigfile *is* set up
        # correctly, and give a more precise failure message if so.
        raise compile_rule.CompileFailure(
            "Failed to download translation file for %s from S3. "
            "Make sure you have git-bigfile set up as per the "
            "configs in the khan-dotfiles repo: namely, the "
            "'bigfile' section in .gitconfig.khan, and the "
            "update_credentials() section in setup.sh." % outfile)
    finally:
        os.chdir(old_cwd)

    # Now copy from the sha-name to the actual output filename.
    for (sha, outfiles) in sha_to_files.iteritems():
        sha_name = os.path.join(os.path.dirname(outfiles[0]), sha)
        for outfile in outfiles:
            log.v2('Copying from %s to %s' % (sha_name, outfile))
            try:
                os.unlink(self.abspath(outfile))
            except OSError:
                pass       # probably file not found
            os.link(self.abspath(sha_name), self.abspath(outfile))
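# Illustrative sketch (not part of the build) of the "does the head look
# like a sha1?" check in build_many() above: a git-bigfile placeholder file
# holds just the 40-hex-char sha1 of the real content, whereas a real
# po-file starts with normal text.  The sample inputs below are made up.
def _example_looks_like_sha1(head):
    """Mirror of the check above: hex-only text looks like a sha1."""
    return head.strip('0123456789abcdefABCDEF') == ''

# _example_looks_like_sha1('2fd4e1c67a2d28fced849ee1bb76e7391b93eb12')  -> True
# _example_looks_like_sha1('# Translations template for PROJECT.')      -> False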