class GitRevisionCollector(RevisionCollector):
  """Output file revisions to git-fast-import."""

  def __init__(self, blob_filename, revision_reader):
    self.blob_filename = blob_filename
    self.revision_reader = revision_reader

  def register_artifacts(self, which_pass):
    # Artifact registration is delegated entirely to the wrapped reader.
    self.revision_reader.register_artifacts(which_pass)

  def start(self):
    self.revision_reader.start()
    self.dump_file = open(self.blob_filename, 'wb')
    self._mark_generator = KeyGenerator()

  def _process_revision(self, cvs_rev):
    """Emit a git-fast-import blob for CVS_REV, unless it is a delete.

    Assign a fresh mark to the blob and store it as
    CVS_REV.revision_reader_token."""

    if isinstance(cvs_rev, CVSRevisionDelete):
      # A delete revision has no contents, and nothing will ever ask
      # for its token, so it can be skipped entirely:
      return

    # FIXME: We have to decide what to do about keyword substitution
    # and eol_style here:
    fulltext = self.revision_reader.get_content(cvs_rev)

    blob_mark = self._mark_generator.gen_id()
    write = self.dump_file.write
    write('blob\n')
    write('mark :%d\n' % (blob_mark,))
    write('data %d\n' % (len(fulltext),))
    write(fulltext)
    write('\n')
    cvs_rev.revision_reader_token = blob_mark

  def _process_symbol(self, cvs_symbol, cvs_file_items):
    """Record the original source of CVS_SYMBOL.

    Determine the original revision source of CVS_SYMBOL, and store it
    as the symbol's revision_reader_token."""

    cvs_source = cvs_symbol.get_cvs_revision_source(cvs_file_items)
    cvs_symbol.revision_reader_token = cvs_source.revision_reader_token

  def process_file(self, cvs_file_items):
    # First pass: emit a blob and assign a token for every live revision.
    for lod_items in cvs_file_items.iter_lods():
      for cvs_rev in lod_items.cvs_revisions:
        self._process_revision(cvs_rev)

    # Second pass: every CVSRevision now carries its token, so each
    # symbol can simply inherit the token of its source revision.
    for lod_items in cvs_file_items.iter_lods():
      lod_symbols = []
      if lod_items.cvs_branch is not None:
        lod_symbols.append(lod_items.cvs_branch)
      lod_symbols.extend(lod_items.cvs_tags)
      for cvs_symbol in lod_symbols:
        self._process_symbol(cvs_symbol, cvs_file_items)

  def finish(self):
    self.revision_reader.finish()
    self.dump_file.close()
class ExternalBlobGenerator(RevisionCollector):
  """Have generate_blobs.py output file revisions to a blob file.

  Work items of the form (rcs_path, marks) are pickled to the stdin of
  an external generate_blobs.py process, which writes the corresponding
  git-fast-import blobs to BLOB_FILENAME."""

  def __init__(self, blob_filename):
    self.blob_filename = blob_filename

  def start(self):
    # Generates the marks handed out for blobs:
    self._mark_generator = KeyGenerator()
    logger.normal("Starting generate_blobs.py...")
    self._popen = subprocess.Popen(
        [sys.executable,
         os.path.join(os.path.dirname(__file__), "generate_blobs.py"),
         self.blob_filename],
        stdin=subprocess.PIPE,
        )

  def _process_symbol(self, cvs_symbol, cvs_file_items):
    """Record the original source of CVS_SYMBOL.

    Determine the original revision source of CVS_SYMBOL, and store it
    as the symbol's revision_reader_token."""

    cvs_source = cvs_symbol.get_cvs_revision_source(cvs_file_items)
    cvs_symbol.revision_reader_token = cvs_source.revision_reader_token

  def process_file(self, cvs_file_items):
    # Assign a mark to every non-delete revision and remember it, both
    # on the CVSRevision itself and in the work item for the helper:
    marks = {}
    for lod_items in cvs_file_items.iter_lods():
      for cvs_rev in lod_items.cvs_revisions:
        if not isinstance(cvs_rev, CVSRevisionDelete):
          mark = self._mark_generator.gen_id()
          cvs_rev.revision_reader_token = mark
          marks[cvs_rev.rev] = mark

    # Bug fix: only send a work item if there is at least one live
    # revision.  Previously an empty marks map was pickled
    # unconditionally, making generate_blobs.py do pointless work for
    # files with no blobs to emit (the later variant of this class
    # already guards this way).
    if marks:
      # A separate pickler is used for each dump(), so that its memo
      # doesn't grow very large.  The default ASCII protocol is used so
      # that this works without changes on systems that distinguish
      # between text and binary files.
      pickle.dump((cvs_file_items.cvs_file.rcs_path, marks), self._popen.stdin)
      self._popen.stdin.flush()

    # Now that all CVSRevisions' revision_reader_tokens are set,
    # iterate through symbols and set their tokens to those of their
    # original source revisions:
    for lod_items in cvs_file_items.iter_lods():
      if lod_items.cvs_branch is not None:
        self._process_symbol(lod_items.cvs_branch, cvs_file_items)
      for cvs_tag in lod_items.cvs_tags:
        self._process_symbol(cvs_tag, cvs_file_items)

  def finish(self):
    # Closing stdin signals end-of-work to generate_blobs.py:
    self._popen.stdin.close()
    logger.normal("Waiting for generate_blobs.py to finish...")
    returncode = self._popen.wait()
    if returncode:
      raise FatalError(
          "generate_blobs.py failed with return code %s." % (returncode,)
          )
    else:
      logger.normal("generate_blobs.py is done.")
class MetadataLogger:
  """Store and generate IDs for the metadata associated with CVSRevisions.

  CVSRevisions that might be combinable must share the same metadata ID,
  so the relationship id <-> metadata has to be one-to-one.  Rather than
  keeping a potentially huge map {metadata : id}, we hash the significant
  parts of the metadata and keep a map {digest : id}.

  Looking up a new set of metadata works as follows: compute the digest;
  if an ID is already registered for it, return that ID; otherwise
  generate a fresh ID, store the metadata in the metadata database under
  it, record {digest : id}, and return the new ID.

  The digest covers the author, log_msg, project_id (only if
  Ctx().cross_project_commits is not set), and branch_name (only if
  Ctx().cross_branch_commits is not set)."""

  def __init__(self, metadata_db):
    self._metadata_db = metadata_db

    # Maps the digest of a metadata combination to its assigned id:
    self._digest_to_id = {}

    # Generates ids for metadata combinations that haven't been seen
    # yet:
    self.key_generator = KeyGenerator()

  def store(self, project, branch_name, author, log_msg):
    """Store the metadata and return its id.

    Locate the record for a commit with the specified (PROJECT,
    BRANCH_NAME, AUTHOR, LOG_MSG) and return its id.  (Depending on
    policy, not all of these items are necessarily used when creating
    the unique id.)  If there is no such record, create one and return
    its newly-generated id."""

    key_parts = [author, log_msg]
    if not Ctx().cross_project_commits:
      key_parts.append('%x' % project.id)
    if not Ctx().cross_branch_commits:
      key_parts.append(branch_name or '')
    digest = sha.new('\0'.join(key_parts)).digest()

    metadata_id = self._digest_to_id.get(digest)
    if metadata_id is None:
      # First time this metadata combination has been seen; register it:
      metadata_id = self.key_generator.gen_id()
      self._digest_to_id[digest] = metadata_id
      self._metadata_db[metadata_id] = Metadata(metadata_id, author, log_msg)
    return metadata_id
class Substituter:
  """Substitute generated values for strings, remembering the mapping.

  Each distinct input string is replaced by TEMPLATE interpolated with a
  freshly-generated integer id; repeated inputs receive the same
  replacement."""

  def __init__(self, template):
    # A '%'-style format template taking one integer, e.g. 'name%d'.
    self.template = template
    self.key_generator = KeyGenerator(1)

    # A map from old values to new ones.
    self.substitutions = {}

  def get_substitution(self, s):
    """Return the substitution for S, creating and recording it if needed."""

    r = self.substitutions.get(s)
    # Bug fix: compare against the None singleton with 'is', not '=='.
    # '==' may invoke an arbitrary __eq__ and is unidiomatic for None.
    if r is None:
      r = self.template % self.key_generator.gen_id()
      self.substitutions[s] = r
    return r
class ExternalBlobGenerator(RevisionCollector):
  """Have generate_blobs.py output file revisions to a blob file."""

  def __init__(self, blob_filename=None):
    self.blob_filename = blob_filename

  def register_artifacts(self, which_pass):
    RevisionCollector.register_artifacts(self, which_pass)
    if self.blob_filename is None:
      # No explicit filename was supplied, so a managed temp file will
      # be used and must be registered:
      artifact_manager.register_temp_file(
          config.GIT_BLOB_DATAFILE, which_pass,
          )

  def start(self):
    self._mark_generator = KeyGenerator()
    logger.normal('Starting generate_blobs.py...')
    if self.blob_filename is None:
      output_filename = artifact_manager.get_temp_file(
          config.GIT_BLOB_DATAFILE
          )
    else:
      output_filename = self.blob_filename
    helper_command = [
        sys.executable,
        os.path.join(os.path.dirname(__file__), 'generate_blobs.py'),
        output_filename,
        ]
    self._pipe = subprocess.Popen(helper_command, stdin=subprocess.PIPE)

  def _process_symbol(self, cvs_symbol, cvs_file_items):
    """Record the original source of CVS_SYMBOL.

    Determine the original revision source of CVS_SYMBOL, and store it
    as the symbol's revision_reader_token."""

    origin = cvs_symbol.get_cvs_revision_source(cvs_file_items)
    cvs_symbol.revision_reader_token = origin.revision_reader_token

  def process_file(self, cvs_file_items):
    # Hand out a mark for every live (non-delete) revision:
    rev_marks = {}
    for lod_items in cvs_file_items.iter_lods():
      for cvs_rev in lod_items.cvs_revisions:
        if isinstance(cvs_rev, CVSRevisionDelete):
          continue
        new_mark = self._mark_generator.gen_id()
        cvs_rev.revision_reader_token = new_mark
        rev_marks[cvs_rev.rev] = new_mark

    if rev_marks:
      # A separate pickler is used for each dump(), so that its memo
      # doesn't grow very large.  The default ASCII protocol is used so
      # that this works without changes on systems that distinguish
      # between text and binary files.
      pickle.dump(
          (cvs_file_items.cvs_file.rcs_path, rev_marks), self._pipe.stdin
          )
      self._pipe.stdin.flush()

    # Every revision now carries its token, so symbols can inherit the
    # tokens of their original source revisions:
    for lod_items in cvs_file_items.iter_lods():
      if lod_items.cvs_branch is not None:
        self._process_symbol(lod_items.cvs_branch, cvs_file_items)
      for cvs_tag in lod_items.cvs_tags:
        self._process_symbol(cvs_tag, cvs_file_items)

  def finish(self):
    # Closing stdin tells the helper that no more work is coming:
    self._pipe.stdin.close()
    logger.normal('Waiting for generate_blobs.py to finish...')
    returncode = self._pipe.wait()
    if not returncode:
      logger.normal('generate_blobs.py is done.')
    else:
      raise FatalError(
          'generate_blobs.py failed with return code %s.' % (returncode,)
          )
class GitOutputOption(DVCSOutputOption):
  """An OutputOption that outputs to a git-fast-import formatted file.

  Members:

    dump_filename -- (string or None) the name of the file to which
        the git-fast-import commands for defining revisions will be
        written.  If None, the data will be written to stdout.

    author_transforms -- a map from CVS author names to git full name
        and email address.  See
        DVCSOutputOption.normalize_author_transforms() for information
        about the form of this parameter.

  """

  name = "Git"

  # The first mark number used for git-fast-import commit marks.  This
  # value needs to be large to avoid conflicts with blob marks.
  _first_commit_mark = 1000000000

  def __init__(
        self, revision_writer,
        dump_filename=None,
        author_transforms=None,
        tie_tag_fixup_branches=False,
        ):
    """Constructor.

    REVISION_WRITER is a GitRevisionWriter that is used to output
    either the content of revisions or a mark that was previously used
    to label a blob.

    DUMP_FILENAME is the name of the file to which the git-fast-import
    commands for defining revisions should be written.  (Please note
    that depending on the style of revision writer, the actual file
    contents might not be written to this file.)  If it is None, then
    the output is written to stdout.

    AUTHOR_TRANSFORMS is a map {cvsauthor : (fullname, email)} from
    CVS author names to git full name and email address.  All of the
    contents should either be Unicode strings or 8-bit strings encoded
    as UTF-8.

    TIE_TAG_FIXUP_BRANCHES means whether after finishing with a tag
    fixup branch, it should be psuedo-merged (ancestry linked but no
    content changes) back into its source branch, to dispose of the
    open head.

    """
    DVCSOutputOption.__init__(self)
    self.dump_filename = dump_filename
    self.revision_writer = revision_writer

    self.author_transforms = self.normalize_author_transforms(
        author_transforms
        )

    self.tie_tag_fixup_branches = tie_tag_fixup_branches

    # Commit marks start high (_first_commit_mark) so they cannot
    # collide with blob marks handed out elsewhere:
    self._mark_generator = KeyGenerator(GitOutputOption._first_commit_mark)

  def register_artifacts(self, which_pass):
    """Register our own artifacts plus those of the revision writer."""

    DVCSOutputOption.register_artifacts(self, which_pass)
    self.revision_writer.register_artifacts(which_pass)

  def check_symbols(self, symbol_map):
    # FIXME: What constraints does git impose on symbols?
    pass

  def setup(self, svn_rev_count):
    """Open the output stream and initialize per-run bookkeeping."""

    DVCSOutputOption.setup(self, svn_rev_count)
    if self.dump_filename is None:
      self.f = sys.stdout
    else:
      self.f = open(self.dump_filename, 'wb')

    # The youngest revnum that has been committed so far:
    self._youngest = 0

    # A map {lod : [(revnum, mark)]} giving each of the revision
    # numbers in which there was a commit to lod, and the mark active
    # at the end of the revnum.
    self._marks = {}

    self.revision_writer.start(self._mirror, self.f)

  def _create_commit_mark(self, lod, revnum):
    """Generate a fresh commit mark, record it for LOD at REVNUM, return it."""

    mark = self._mark_generator.gen_id()
    self._set_lod_mark(lod, revnum, mark)
    return mark

  def _set_lod_mark(self, lod, revnum, mark):
    """Record MARK as the status of LOD for REVNUM.

    If there is already an entry for REVNUM, overwrite it.  If not,
    append a new entry to the self._marks list for LOD."""

    # Entries must be added in nondecreasing revnum order so that the
    # per-LOD lists stay sorted (they are binary-searched later):
    assert revnum >= self._youngest
    entry = (revnum, mark)
    try:
      modifications = self._marks[lod]
    except KeyError:
      # This LOD hasn't appeared before; create a new list and add the
      # entry:
      self._marks[lod] = [entry]
    else:
      # A record exists, so it necessarily has at least one element:
      if modifications[-1][0] == revnum:
        modifications[-1] = entry
      else:
        modifications.append(entry)
    self._youngest = revnum

  def _get_author(self, svn_commit):
    """Return the author to be used for SVN_COMMIT.

    Return the author as a UTF-8 string in the form needed by git
    fast-import; that is, 'name <email>'."""

    cvs_author = svn_commit.get_author()
    return self._map_author(cvs_author)

  def _map_author(self, cvs_author):
    """Map CVS_AUTHOR through author_transforms; default to 'name <>'."""

    return self.author_transforms.get(cvs_author, "%s <>" % (cvs_author,))

  @staticmethod
  def _get_log_msg(svn_commit):
    """Return the log message to be used for SVN_COMMIT."""

    return svn_commit.get_log_msg()

  def process_initial_project_commit(self, svn_commit):
    # Nothing is written to the output; only the mirror is cycled
    # through an empty commit for this revnum:
    self._mirror.start_commit(svn_commit.revnum)
    self._mirror.end_commit()

  def process_primary_commit(self, svn_commit):
    """Write a normal commit (on trunk or a branch) for SVN_COMMIT."""

    author = self._get_author(svn_commit)
    log_msg = self._get_log_msg(svn_commit)

    # A primary commit is expected to touch exactly one line of
    # development; anything else indicates an internal inconsistency:
    lods = set()
    for cvs_rev in svn_commit.get_cvs_items():
      lods.add(cvs_rev.lod)
    if len(lods) != 1:
      raise InternalError('Commit affects %d LODs' % (len(lods),))
    lod = lods.pop()

    self._mirror.start_commit(svn_commit.revnum)
    if isinstance(lod, Trunk):
      # FIXME: is this correct?:
      self.f.write('commit refs/heads/master\n')
    else:
      self.f.write('commit refs/heads/%s\n' % (lod.name,))
    mark = self._create_commit_mark(lod, svn_commit.revnum)
    logger.normal(
        'Writing commit r%d on %s (mark :%d)'
        % (svn_commit.revnum, lod, mark,)
        )
    self.f.write('mark :%d\n' % (mark,))
    self.f.write(
        'committer %s %d +0000\n' % (author, svn_commit.date,)
        )
    self.f.write('data %d\n' % (len(log_msg),))
    self.f.write('%s\n' % (log_msg,))
    for cvs_rev in svn_commit.get_cvs_items():
      self.revision_writer.process_revision(cvs_rev, post_commit=False)

    self.f.write('\n')
    self._mirror.end_commit()

  def process_post_commit(self, svn_commit):
    """Write a commit that brings non-trunk default-branch content to trunk.

    The commit is recorded on master with a 'merge' line pointing back
    at the source LOD's active mark."""

    author = self._get_author(svn_commit)
    log_msg = self._get_log_msg(svn_commit)

    # All of the revisions must come from a single source LOD:
    source_lods = set()
    for cvs_rev in svn_commit.cvs_revs:
      source_lods.add(cvs_rev.lod)
    if len(source_lods) != 1:
      raise InternalError('Commit is from %d LODs' % (len(source_lods),))
    source_lod = source_lods.pop()

    self._mirror.start_commit(svn_commit.revnum)
    # FIXME: is this correct?:
    self.f.write('commit refs/heads/master\n')
    # NOTE(review): the mark is recorded under lod=None rather than a
    # Trunk instance here — presumably intentional, but confirm against
    # _get_source_mark's callers.
    mark = self._create_commit_mark(None, svn_commit.revnum)
    logger.normal(
        'Writing post-commit r%d on Trunk (mark :%d)'
        % (svn_commit.revnum, mark,)
        )
    self.f.write('mark :%d\n' % (mark,))
    self.f.write(
        'committer %s %d +0000\n' % (author, svn_commit.date,)
        )
    self.f.write('data %d\n' % (len(log_msg),))
    self.f.write('%s\n' % (log_msg,))
    self.f.write(
        'merge :%d\n'
        % (self._get_source_mark(source_lod, svn_commit.revnum),)
        )
    for cvs_rev in svn_commit.cvs_revs:
      self.revision_writer.process_revision(cvs_rev, post_commit=True)

    self.f.write('\n')
    self._mirror.end_commit()

  def _get_source_mark(self, source_lod, revnum):
    """Return the mark active on SOURCE_LOD at the end of REVNUM."""

    modifications = self._marks[source_lod]
    # Find the last entry with revnum <= REVNUM (the list is kept
    # sorted by _set_lod_mark); note that 'revnum' is deliberately
    # rebound by the unpacking below:
    i = bisect.bisect_left(modifications, (revnum + 1,)) - 1
    (revnum, mark) = modifications[i]
    return mark

  def describe_lod_to_user(self, lod):
    """This needs to make sense to users of the fastimported result."""
    if isinstance(lod, Trunk):
      return 'master'
    else:
      return lod.name

  def _describe_commit(self, svn_commit, lod):
    """Return a one-line, human-readable summary of SVN_COMMIT on LOD."""

    author = self._map_author(svn_commit.get_author())
    # Strip the placeholder empty-email suffix for display purposes:
    if author.endswith(" <>"):
      author = author[:-3]
    date = time.strftime(
        "%Y-%m-%d %H:%M:%S UTC", time.gmtime(svn_commit.date)
        )
    log_msg = svn_commit.get_log_msg()
    # Only the first line of the log message is shown:
    if log_msg.find('\n') != -1:
      log_msg = log_msg[:log_msg.index('\n')]
    return "%s %s %s '%s'" % (
        self.describe_lod_to_user(lod), date, author, log_msg,)

  def _process_symbol_commit(self, svn_commit, git_branch, source_groups):
    """Write a fixup commit for SVN_COMMIT's symbol onto GIT_BRANCH.

    SOURCE_GROUPS is a list of (source_revnum, source_lod, cvs_symbols)
    tuples; the first entry is the primary parent.  Return the mark of
    the commit that was written."""

    author = self._get_author(svn_commit)
    log_msg = self._get_log_msg(svn_commit)

    # There are two distinct cases we need to care for here:
    #  1. initial creation of a LOD
    #  2. fixup of an existing LOD to include more files, because the LOD in
    #     CVS was created piecemeal over time, with intervening commits

    # We look at _marks here, but self._mirror._get_lod_history(lod).exists()
    # might be technically more correct (though _get_lod_history is currently
    # underscore-private)
    is_initial_lod_creation = svn_commit.symbol not in self._marks

    # Create the mark, only after the check above
    mark = self._create_commit_mark(svn_commit.symbol, svn_commit.revnum)

    if is_initial_lod_creation:
      # Get the primary parent
      p_source_revnum, p_source_lod, p_cvs_symbols = source_groups[0]
      try:
        p_source_node = self._mirror.get_old_lod_directory(
            p_source_lod, p_source_revnum
            )
      except KeyError:
        raise InternalError('Source %r does not exist' % (p_source_lod,))
      # Start by assuming every file in the parent must be deleted, then
      # keep each file that some source group contributes:
      cvs_files_to_delete = set(self._get_all_files(p_source_node))

      for (source_revnum, source_lod, cvs_symbols,) in source_groups:
        for cvs_symbol in cvs_symbols:
          cvs_files_to_delete.discard(cvs_symbol.cvs_file)

    self.f.write('commit %s\n' % (git_branch,))
    self.f.write('mark :%d\n' % (mark,))
    self.f.write('committer %s %d +0000\n' % (author, svn_commit.date,))
    self.f.write('data %d\n' % (len(log_msg),))
    self.f.write('%s\n' % (log_msg,))

    # Only record actual DVCS ancestry for the primary sprout parent,
    # all the rest are effectively cherrypicks.
    if is_initial_lod_creation:
      self.f.write(
          'from :%d\n'
          % (self._get_source_mark(p_source_lod, p_source_revnum),)
          )

    for (source_revnum, source_lod, cvs_symbols,) in source_groups:
      for cvs_symbol in cvs_symbols:
        self.revision_writer.branch_file(cvs_symbol)

    if is_initial_lod_creation:
      for cvs_file in cvs_files_to_delete:
        self.f.write('D %s\n' % (cvs_file.cvs_path,))

    self.f.write('\n')
    return mark

  def process_branch_commit(self, svn_commit):
    """Create or fix up the branch for SVN_COMMIT's symbol."""

    self._mirror.start_commit(svn_commit.revnum)

    source_groups = self._get_source_groups(svn_commit)
    if self._is_simple_copy(svn_commit, source_groups):
      # The branch can be created by pointing at an existing commit;
      # no fixup commit is needed:
      (source_revnum, source_lod, cvs_symbols) = source_groups[0]
      logger.debug(
          '%s will be created via a simple copy from %s:r%d'
          % (svn_commit.symbol, source_lod, source_revnum,)
          )
      mark = self._get_source_mark(source_lod, source_revnum)
      self._set_symbol(svn_commit.symbol, mark)
      self._mirror.copy_lod(source_lod, svn_commit.symbol, source_revnum)
      self._set_lod_mark(svn_commit.symbol, svn_commit.revnum, mark)
    else:
      logger.debug(
          '%s will be created via fixup commit(s)' % (svn_commit.symbol,)
          )
      self._process_symbol_commit(
          svn_commit, 'refs/heads/%s' % (svn_commit.symbol.name,),
          source_groups,
          )

    self._mirror.end_commit()

  def _set_symbol(self, symbol, mark):
    """Emit a 'reset' that points SYMBOL's git ref at MARK."""

    if isinstance(symbol, Branch):
      category = 'heads'
    elif isinstance(symbol, Tag):
      category = 'tags'
    else:
      raise InternalError()
    self.f.write('reset refs/%s/%s\n' % (category, symbol.name,))
    self.f.write('from :%d\n' % (mark,))

  def get_tag_fixup_branch_name(self, svn_commit):
    # The branch name to use for the "tag fixup branches".  The
    # git-fast-import documentation suggests using 'TAG_FIXUP'
    # (outside of the refs/heads namespace), but this is currently
    # broken.  Use a name containing '.', which is not allowed in CVS
    # symbols, to avoid conflicts (though of course a conflict could
    # still result if the user requests symbol transformations).
    return 'refs/heads/TAG.FIXUP'

  def process_tag_commit(self, svn_commit):
    """Create the tag for SVN_COMMIT, via a fixup branch if necessary."""

    # FIXME: For now we create a fixup branch with the same name as
    # the tag, then the tag.  We never delete the fixup branch.
    self._mirror.start_commit(svn_commit.revnum)

    source_groups = self._get_source_groups(svn_commit)
    if self._is_simple_copy(svn_commit, source_groups):
      # The tag can point directly at an existing commit:
      (source_revnum, source_lod, cvs_symbols) = source_groups[0]
      logger.debug(
          '%s will be created via a simple copy from %s:r%d'
          % (svn_commit.symbol, source_lod, source_revnum,)
          )
      mark = self._get_source_mark(source_lod, source_revnum)
      self._set_symbol(svn_commit.symbol, mark)
      self._mirror.copy_lod(source_lod, svn_commit.symbol, source_revnum)
      self._set_lod_mark(svn_commit.symbol, svn_commit.revnum, mark)
    else:
      logger.debug(
          '%s will be created via a fixup branch' % (svn_commit.symbol,)
          )

      fixup_branch_name = self.get_tag_fixup_branch_name(svn_commit)

      # Create the fixup branch (which might involve making more than
      # one commit):
      mark = self._process_symbol_commit(
          svn_commit, fixup_branch_name, source_groups
          )

      # Store the mark of the last commit to the fixup branch as the
      # value of the tag:
      self._set_symbol(svn_commit.symbol, mark)
      self.f.write('reset %s\n' % (fixup_branch_name,))
      self.f.write('\n')

      if self.tie_tag_fixup_branches:
        # Pseudo-merge the fixup branch back into its source branch so
        # the fixup branch does not remain an open head:
        source_lod = source_groups[0][1]
        source_lod_git_branch = \
            'refs/heads/%s' % (getattr(source_lod, 'name', 'master'),)

        mark2 = self._create_commit_mark(source_lod, svn_commit.revnum)
        author = self._map_author(Ctx().username)
        log_msg = self._get_log_msg_for_ancestry_tie(svn_commit)

        self.f.write('commit %s\n' % (source_lod_git_branch,))
        self.f.write('mark :%d\n' % (mark2,))
        self.f.write('committer %s %d +0000\n' % (author, svn_commit.date,))
        self.f.write('data %d\n' % (len(log_msg),))
        self.f.write('%s\n' % (log_msg,))

        self.f.write(
            'merge :%d\n'
            % (mark,)
            )

        self.f.write('\n')

    self._mirror.end_commit()

  def _get_log_msg_for_ancestry_tie(self, svn_commit):
    """Return the wrapped log message used for an ancestry-tie commit."""

    return Ctx().text_wrapper.fill(
        Ctx().tie_tag_ancestry_message % {
            'symbol_name' : svn_commit.symbol.name,
            }
        )

  def cleanup(self):
    """Finish the revision writer and close the output stream.

    The stream is only closed if we opened it ourselves (i.e. it is not
    stdout)."""

    DVCSOutputOption.cleanup(self)
    self.revision_writer.finish()
    if self.dump_filename is not None:
      self.f.close()
    del self.f
class RepositoryMirror:
  """Mirror a repository and its history.

  Mirror a repository as it is constructed, one revision at a time.
  For each LineOfDevelopment we store a skeleton of the directory
  structure within that LOD for each revnum in which it changed.

  For each LOD that has been seen so far, an LODHistory instance is
  stored in self._lod_histories.  An LODHistory keeps track of each
  revnum in which files were added to or deleted from that LOD, as
  well as the node id of the root of the node tree describing the LOD
  contents at that revision.

  The LOD trees themselves are stored in the _node_db database, which
  maps node ids to nodes.  A node is a map from CVSPath to ids of the
  corresponding subnodes.  The _node_db is stored on disk and each
  access is expensive.

  The _node_db database only holds the nodes for old revisions.  The
  revision that is being constructed is kept in memory in the
  _new_nodes map, which is cheap to access.

  You must invoke start_commit() before each commit and end_commit()
  afterwards."""

  def register_artifacts(self, which_pass):
    """Register the artifacts that will be needed for this object."""

    artifact_manager.register_temp_file(
        config.MIRROR_NODES_INDEX_TABLE, which_pass
        )
    artifact_manager.register_temp_file(
        config.MIRROR_NODES_STORE, which_pass
        )

  def open(self):
    """Set up the RepositoryMirror and prepare it for commits."""

    # Generates ids for newly-created nodes:
    self._key_generator = KeyGenerator()

    # A map from LOD to LODHistory instance for all LODs that have
    # been referenced so far:
    self._lod_histories = {}

    # This corresponds to the 'nodes' table in a Subversion fs.  (We
    # don't need a 'representations' or 'strings' table because we
    # only track file existence, not file contents.)
    self._node_db = _NodeDatabase()

    # Start at revision 0 without a root node.
    self._youngest = 0

  def start_commit(self, revnum):
    """Start a new commit."""

    # Revnums must strictly increase across commits:
    assert revnum > self._youngest
    self._youngest = revnum

    # A map {node_id : _WritableMirrorDirectoryMixin}.
    self._new_nodes = {}

  def end_commit(self):
    """Called at the end of each commit.

    This method copies the newly created nodes to the on-disk nodes
    db."""

    # Copy the new nodes to the _node_db; nodes representing deletions
    # are not persisted:
    self._node_db.write_new_nodes([
        node
        for node in self._new_nodes.values()
        if not isinstance(node, DeletedCurrentMirrorDirectory)
        ])

    # Deleting the attribute makes accidental use outside of a commit
    # fail loudly:
    del self._new_nodes

  def _get_lod_history(self, lod):
    """Return the LODHistory instance describing LOD.

    Create a new (empty) LODHistory if it doesn't yet exist."""

    try:
      return self._lod_histories[lod]
    except KeyError:
      lod_history = LODHistory(self, lod)
      self._lod_histories[lod] = lod_history
      return lod_history

  def get_old_lod_directory(self, lod, revnum):
    """Return the directory for the root path of LOD at revision REVNUM.

    Return an instance of MirrorDirectory if the path exists;
    otherwise, raise KeyError."""

    lod_history = self._get_lod_history(lod)
    id = lod_history.get_id(revnum)
    return OldMirrorDirectory(self, id, self._node_db[id])

  def get_old_path(self, cvs_path, lod, revnum):
    """Return the node for CVS_PATH from LOD at REVNUM.

    If CVS_PATH is a CVSDirectory, then return an instance of
    OldMirrorDirectory.  If CVS_PATH is a CVSFile, return None.

    If CVS_PATH does not exist in the specified LOD and REVNUM, raise
    KeyError."""

    node = self.get_old_lod_directory(lod, revnum)

    # Walk down from the LOD root, one ancestry component at a time:
    for sub_path in cvs_path.get_ancestry()[1:]:
      node = node[sub_path]

    return node

  def get_current_lod_directory(self, lod):
    """Return the directory for the root path of LOD in the current revision.

    Return an instance of CurrentMirrorDirectory.  Raise KeyError if
    the path doesn't already exist."""

    lod_history = self._get_lod_history(lod)
    id = lod_history.get_current_id()
    try:
      # Prefer the in-memory node if it was touched in this commit:
      return self._new_nodes[id]
    except KeyError:
      return _CurrentMirrorReadOnlyLODDirectory(
          self, id, lod, self._node_db[id]
          )

  def get_current_path(self, cvs_path, lod):
    """Return the node for CVS_PATH from LOD in the current revision.

    If CVS_PATH is a CVSDirectory, then return an instance of
    CurrentMirrorDirectory.  If CVS_PATH is a CVSFile, return None.

    If CVS_PATH does not exist in the current revision of the
    specified LOD, raise KeyError."""

    node = self.get_current_lod_directory(lod)

    for sub_path in cvs_path.get_ancestry()[1:]:
      node = node[sub_path]

    return node

  def add_lod(self, lod):
    """Create a new LOD in this repository.

    Return the CurrentMirrorDirectory that was created.  If the LOD
    already exists, raise LODExistsError."""

    lod_history = self._get_lod_history(lod)
    if lod_history.exists():
      raise LODExistsError(
          'Attempt to create %s in repository mirror when it already exists.'
          % (lod,)
          )
    new_node = _CurrentMirrorWritableLODDirectory(
        self, self._key_generator.gen_id(), lod, {}
        )
    lod_history.update(self._youngest, new_node.id)
    self._new_nodes[new_node.id] = new_node
    return new_node

  def copy_lod(self, src_lod, dest_lod, src_revnum):
    """Copy all of SRC_LOD at SRC_REVNUM to DST_LOD.

    In the youngest revision of the repository, the destination LOD
    *must not* already exist.

    Return the new node at DEST_LOD, as a CurrentMirrorDirectory."""

    # Get the node of our src_path
    src_node = self.get_old_lod_directory(src_lod, src_revnum)

    dest_lod_history = self._get_lod_history(dest_lod)
    if dest_lod_history.exists():
      raise LODExistsError(
          'Attempt to copy to %s in repository mirror when it already exists.'
          % (dest_lod,)
          )

    # The copy shares the source's node id rather than duplicating the
    # tree; no new nodes are created:
    dest_lod_history.update(self._youngest, src_node.id)

    # Return src_node, except packaged up as a CurrentMirrorDirectory:
    return self.get_current_lod_directory(dest_lod)

  def close(self):
    """Free resources and close databases."""

    self._lod_histories = None
    self._node_db.close()
    self._node_db = None
class GitOutputOption(DVCSOutputOption):
  """An OutputOption that outputs to a git-fast-import formatted file.

  Members:

    dump_filename -- (string) the name of the file to which the
        git-fast-import commands for defining revisions will be
        written.

    author_transforms -- a map from CVS author names to git full name
        and email address.  See
        DVCSOutputOption.normalize_author_transforms() for information
        about the form of this parameter.

  """

  name = "Git"

  # The first mark number used for git-fast-import commit marks.  This
  # value needs to be large to avoid conflicts with blob marks.
  _first_commit_mark = 1000000000

  def __init__(
        self, dump_filename, revision_writer,
        author_transforms=None,
        tie_tag_fixup_branches=False,
        ):
    """Constructor.

    DUMP_FILENAME is the name of the file to which the git-fast-import
    commands for defining revisions should be written.  (Please note
    that depending on the style of revision writer, the actual file
    contents might not be written to this file.)

    REVISION_WRITER is a GitRevisionWriter that is used to output
    either the content of revisions or a mark that was previously used
    to label a blob.

    AUTHOR_TRANSFORMS is a map {cvsauthor : (fullname, email)} from
    CVS author names to git full name and email address.  All of the
    contents should either be Unicode strings or 8-bit strings encoded
    as UTF-8.

    TIE_TAG_FIXUP_BRANCHES means whether after finishing with a tag
    fixup branch, it should be pseudo-merged (ancestry linked but no
    content changes) back into its source branch, to dispose of the
    open head.

    """
    DVCSOutputOption.__init__(self)
    self.dump_filename = dump_filename
    self.revision_writer = revision_writer

    self.author_transforms = self.normalize_author_transforms(
        author_transforms
        )

    self.tie_tag_fixup_branches = tie_tag_fixup_branches

    self._mark_generator = KeyGenerator(GitOutputOption._first_commit_mark)

  def register_artifacts(self, which_pass):
    # Forward artifact registration to the base class and to the
    # revision writer that we delegate content output to:
    DVCSOutputOption.register_artifacts(self, which_pass)
    self.revision_writer.register_artifacts(which_pass)

  def check_symbols(self, symbol_map):
    # FIXME: What constraints does git impose on symbols?
    pass

  def setup(self, svn_rev_count):
    DVCSOutputOption.setup(self, svn_rev_count)
    self.f = open(self.dump_filename, 'wb')

    # The youngest revnum that has been committed so far:
    self._youngest = 0

    # A map {lod : [(revnum, mark)]} giving each of the revision
    # numbers in which there was a commit to lod, and the mark active
    # at the end of the revnum.
    self._marks = {}

    self.revision_writer.start(self._mirror, self.f)

  def _create_commit_mark(self, lod, revnum):
    """Allocate and return a fresh commit mark, recording it as the
    state of LOD at the end of REVNUM."""

    mark = self._mark_generator.gen_id()
    self._set_lod_mark(lod, revnum, mark)
    return mark

  def _set_lod_mark(self, lod, revnum, mark):
    """Record MARK as the status of LOD for REVNUM.

    If there is already an entry for REVNUM, overwrite it.  If not,
    append a new entry to the self._marks list for LOD."""

    # Commits must be processed in non-decreasing revnum order:
    assert revnum >= self._youngest
    entry = (revnum, mark)
    try:
      modifications = self._marks[lod]
    except KeyError:
      # This LOD hasn't appeared before; create a new list and add the
      # entry:
      self._marks[lod] = [entry]
    else:
      # A record exists, so it necessarily has at least one element:
      if modifications[-1][0] == revnum:
        modifications[-1] = entry
      else:
        modifications.append(entry)
    self._youngest = revnum

  def _get_author(self, svn_commit):
    """Return the author to be used for SVN_COMMIT.

    Return the author as a UTF-8 string in the form needed by git
    fast-import; that is, 'name <email>'."""

    cvs_author = svn_commit.get_author()
    return self._map_author(cvs_author)

  def _map_author(self, cvs_author):
    # Fall back to the raw CVS author name with an empty email if no
    # transform was configured for it:
    return self.author_transforms.get(cvs_author, "%s <>" % (cvs_author,))

  @staticmethod
  def _get_log_msg(svn_commit):
    return svn_commit.get_log_msg()

  def process_initial_project_commit(self, svn_commit):
    # Nothing is written to the fast-import stream for the initial
    # project commit; only the mirror revision bookkeeping advances:
    self._mirror.start_commit(svn_commit.revnum)
    self._mirror.end_commit()

  def process_primary_commit(self, svn_commit):
    author = self._get_author(svn_commit)
    log_msg = self._get_log_msg(svn_commit)

    # A primary commit must touch exactly one line of development:
    lods = set()
    for cvs_rev in svn_commit.get_cvs_items():
      lods.add(cvs_rev.lod)
    if len(lods) != 1:
      raise InternalError('Commit affects %d LODs' % (len(lods),))
    lod = lods.pop()

    self._mirror.start_commit(svn_commit.revnum)
    if isinstance(lod, Trunk):
      # FIXME: is this correct?:
      self.f.write('commit refs/heads/master\n')
    else:
      self.f.write('commit refs/heads/%s\n' % (lod.name,))

    self.f.write(
        'mark :%d\n'
        % (self._create_commit_mark(lod, svn_commit.revnum),)
        )
    self.f.write(
        'committer %s %d +0000\n' % (author, svn_commit.date,)
        )
    self.f.write('data %d\n' % (len(log_msg),))
    self.f.write('%s\n' % (log_msg,))
    for cvs_rev in svn_commit.get_cvs_items():
      self.revision_writer.process_revision(cvs_rev, post_commit=False)

    self.f.write('\n')
    self._mirror.end_commit()

  def process_post_commit(self, svn_commit):
    author = self._get_author(svn_commit)
    log_msg = self._get_log_msg(svn_commit)

    # A post-commit copies NTDB revisions to trunk; they must all come
    # from a single source LOD:
    source_lods = set()
    for cvs_rev in svn_commit.cvs_revs:
      source_lods.add(cvs_rev.lod)
    if len(source_lods) != 1:
      raise InternalError('Commit is from %d LODs' % (len(source_lods),))
    source_lod = source_lods.pop()

    self._mirror.start_commit(svn_commit.revnum)
    # FIXME: is this correct?:
    self.f.write('commit refs/heads/master\n')
    # NOTE(review): the mark is recorded under lod=None here rather
    # than under Trunk -- confirm that lookups of trunk marks are
    # consistent with this convention.
    self.f.write(
        'mark :%d\n'
        % (self._create_commit_mark(None, svn_commit.revnum),)
        )
    self.f.write(
        'committer %s %d +0000\n' % (author, svn_commit.date,)
        )
    self.f.write('data %d\n' % (len(log_msg),))
    self.f.write('%s\n' % (log_msg,))
    # Link the copy back to the branch it came from:
    self.f.write(
        'merge :%d\n'
        % (self._get_source_mark(source_lod, svn_commit.revnum),)
        )
    for cvs_rev in svn_commit.cvs_revs:
      self.revision_writer.process_revision(cvs_rev, post_commit=True)

    self.f.write('\n')
    self._mirror.end_commit()

  def _get_source_mark(self, source_lod, revnum):
    """Return the mark active on SOURCE_LOD at the end of REVNUM."""

    modifications = self._marks[source_lod]
    # Locate the last entry whose revnum is <= REVNUM:
    i = bisect.bisect_left(modifications, (revnum + 1,)) - 1
    (revnum, mark) = modifications[i]
    return mark

  def describe_lod_to_user(self, lod):
    """This needs to make sense to users of the fastimported result."""

    if isinstance(lod, Trunk):
      return 'master'
    else:
      return lod.name

  def _describe_commit(self, svn_commit, lod):
    """Return a one-line human-readable description of SVN_COMMIT on
    LOD, for use in log-message trailers."""

    author = self._map_author(svn_commit.get_author())
    # Strip the placeholder empty email for readability:
    if author.endswith(" <>"):
      author = author[:-3]
    date = time.strftime(
        "%Y-%m-%d %H:%M:%S UTC", time.gmtime(svn_commit.date)
        )
    log_msg = svn_commit.get_log_msg()
    # Only the first line of the log message is used:
    if log_msg.find('\n') != -1:
      log_msg = log_msg[:log_msg.index('\n')]
    return "%s %s %s '%s'" % (
        self.describe_lod_to_user(lod), date, author, log_msg,)

  def _process_symbol_commit(self, svn_commit, git_branch, source_groups):
    author = self._get_author(svn_commit)
    log_msg = self._get_log_msg(svn_commit)

    # There are two distinct cases we need to care for here:
    #  1. initial creation of a LOD
    #  2. fixup of an existing LOD to include more files, because the LOD in
    #     CVS was created piecemeal over time, with intervening commits

    # We look at _marks here, but self._mirror._get_lod_history(lod).exists()
    # might be technically more correct (though _get_lod_history is currently
    # underscore-private)
    is_initial_lod_creation = svn_commit.symbol not in self._marks

    # Create the mark, only after the check above
    mark = self._create_commit_mark(svn_commit.symbol, svn_commit.revnum)

    if is_initial_lod_creation:
      # Get the primary parent
      p_source_revnum, p_source_lod, p_cvs_symbols = source_groups[0]
      try:
        p_source_node = self._mirror.get_old_lod_directory(
            p_source_lod, p_source_revnum
            )
      except KeyError:
        raise InternalError('Source %r does not exist' % (p_source_lod,))
      # Start by assuming every file of the primary parent must be
      # deleted, then keep each file that some source group provides:
      cvs_files_to_delete = set(self._get_all_files(p_source_node))

      for (source_revnum, source_lod, cvs_symbols,) in source_groups:
        for cvs_symbol in cvs_symbols:
          cvs_files_to_delete.discard(cvs_symbol.cvs_file)

    # Write a trailer to the log message which describes the cherrypicks that
    # make up this symbol creation.
    log_msg += "\n"
    if is_initial_lod_creation:
      log_msg += "\nSprout from %s" % (
          self._describe_commit(
              Ctx()._persistence_manager.get_svn_commit(p_source_revnum),
              p_source_lod
              ),
          )

    # For an initial creation, group 0 is the sprout source, so skip it:
    for (source_revnum, source_lod, cvs_symbols,) \
          in source_groups[(is_initial_lod_creation and 1 or 0):]:
      log_msg += "\nCherrypick from %s:" % (
          self._describe_commit(
              Ctx()._persistence_manager.get_svn_commit(source_revnum),
              source_lod
              ),
          )
      for cvs_path in sorted(
            cvs_symbol.cvs_file.cvs_path for cvs_symbol in cvs_symbols
            ):
        log_msg += "\n %s" % (cvs_path,)
    if is_initial_lod_creation:
      if cvs_files_to_delete:
        log_msg += "\nDelete:"
        for cvs_path in sorted(
              cvs_file.cvs_path for cvs_file in cvs_files_to_delete
              ):
          log_msg += "\n %s" % (cvs_path,)

    self.f.write('commit %s\n' % (git_branch,))
    self.f.write('mark :%d\n' % (mark,))
    self.f.write('committer %s %d +0000\n' % (author, svn_commit.date,))
    self.f.write('data %d\n' % (len(log_msg),))
    self.f.write('%s\n' % (log_msg,))

    # Only record actual DVCS ancestry for the primary sprout parent,
    # all the rest are effectively cherrypicks.
    if is_initial_lod_creation:
      self.f.write(
          'from :%d\n'
          % (self._get_source_mark(p_source_lod, p_source_revnum),)
          )

    for (source_revnum, source_lod, cvs_symbols,) in source_groups:
      for cvs_symbol in cvs_symbols:
        self.revision_writer.branch_file(cvs_symbol)

    if is_initial_lod_creation:
      for cvs_file in cvs_files_to_delete:
        self.f.write('D %s\n' % (cvs_file.cvs_path,))

    self.f.write('\n')
    return mark

  def process_branch_commit(self, svn_commit):
    self._mirror.start_commit(svn_commit.revnum)

    source_groups = self._get_source_groups(svn_commit)
    if self._is_simple_copy(svn_commit, source_groups):
      # A simple copy needs no fixup commit; reuse the source's mark:
      (source_revnum, source_lod, cvs_symbols) = source_groups[0]
      logger.debug(
          '%s will be created via a simple copy from %s:r%d'
          % (svn_commit.symbol, source_lod, source_revnum,)
          )
      mark = self._get_source_mark(source_lod, source_revnum)
      self._set_symbol(svn_commit.symbol, mark)
      self._mirror.copy_lod(source_lod, svn_commit.symbol, source_revnum)
      self._set_lod_mark(svn_commit.symbol, svn_commit.revnum, mark)
    else:
      logger.debug(
          '%s will be created via fixup commit(s)' % (svn_commit.symbol,)
          )
      self._process_symbol_commit(
          svn_commit, 'refs/heads/%s' % (svn_commit.symbol.name,),
          source_groups,
          )

    self._mirror.end_commit()

  def _set_symbol(self, symbol, mark):
    """Emit a 'reset' command pointing the git ref for SYMBOL at MARK."""

    if isinstance(symbol, Branch):
      category = 'heads'
    elif isinstance(symbol, Tag):
      category = 'tags'
    else:
      raise InternalError()
    self.f.write('reset refs/%s/%s\n' % (category, symbol.name,))
    self.f.write('from :%d\n' % (mark,))

  def get_tag_fixup_branch_name(self, svn_commit):
    # The branch name to use for the "tag fixup branches".  The
    # git-fast-import documentation suggests using 'TAG_FIXUP'
    # (outside of the refs/heads namespace), but this is currently
    # broken.  Use a name containing '.', which is not allowed in CVS
    # symbols, to avoid conflicts (though of course a conflict could
    # still result if the user requests symbol transformations).
    return 'refs/heads/TAG.FIXUP'

  def process_tag_commit(self, svn_commit):
    # FIXME: For now we create a fixup branch with the same name as
    # the tag, then the tag.  We never delete the fixup branch.
    self._mirror.start_commit(svn_commit.revnum)

    source_groups = self._get_source_groups(svn_commit)
    if self._is_simple_copy(svn_commit, source_groups):
      (source_revnum, source_lod, cvs_symbols) = source_groups[0]
      logger.debug(
          '%s will be created via a simple copy from %s:r%d'
          % (svn_commit.symbol, source_lod, source_revnum,)
          )
      mark = self._get_source_mark(source_lod, source_revnum)
      self._set_symbol(svn_commit.symbol, mark)
      self._mirror.copy_lod(source_lod, svn_commit.symbol, source_revnum)
      self._set_lod_mark(svn_commit.symbol, svn_commit.revnum, mark)
    else:
      logger.debug(
          '%s will be created via a fixup branch' % (svn_commit.symbol,)
          )

      fixup_branch_name = self.get_tag_fixup_branch_name(svn_commit)

      # Create the fixup branch (which might involve making more than
      # one commit):
      mark = self._process_symbol_commit(
          svn_commit, fixup_branch_name, source_groups
          )

      # Store the mark of the last commit to the fixup branch as the
      # value of the tag:
      self._set_symbol(svn_commit.symbol, mark)
      self.f.write('reset %s\n' % (fixup_branch_name,))
      self.f.write('\n')

      if self.tie_tag_fixup_branches:
        source_lod = source_groups[0][1]
        source_lod_git_branch = \
            'refs/heads/%s' % (getattr(source_lod, 'name', 'master'),)

        mark2 = self._create_commit_mark(source_lod, svn_commit.revnum)
        author = self._map_author(Ctx().username)
        log_msg = self._get_log_msg_for_ancestry_tie(svn_commit)

        self.f.write('commit %s\n' % (source_lod_git_branch,))
        self.f.write('mark :%d\n' % (mark2,))
        self.f.write('committer %s %d +0000\n' % (author, svn_commit.date,))
        self.f.write('data %d\n' % (len(log_msg),))
        self.f.write('%s\n' % (log_msg,))

        # Pseudo-merge the fixup branch back into its source branch so
        # that the fixup branch's head is not left dangling:
        self.f.write(
            'merge :%d\n'
            % (mark,)
            )

        self.f.write('\n')

    self._mirror.end_commit()

  def _get_log_msg_for_ancestry_tie(self, svn_commit):
    return Ctx().text_wrapper.fill(
        Ctx().tie_tag_ancestry_message % {
            'symbol_name' : svn_commit.symbol.name,
            }
        )

  def cleanup(self):
    DVCSOutputOption.cleanup(self)
    self.revision_writer.finish()
    self.f.close()
    del self.f
class GitRevisionRecorder(FulltextRevisionRecorder):
  """Output file revisions to git-fast-import."""

  def __init__(self, blob_filename):
    self.blob_filename = blob_filename

  def start(self):
    # Open the blob file and start numbering marks from the beginning:
    self.dump_file = open(self.blob_filename, 'wb')
    self._mark_generator = KeyGenerator()

  def start_file(self, cvs_file_items):
    # Remember the current file's items for use while recording its
    # revisions:
    self._cvs_file_items = cvs_file_items

  def _get_original_source(self, cvs_rev):
    """Return the original source of the contents of CVS_REV.

    Return the first non-delete CVSRevision with the same contents as
    CVS_REV.  'First' here refers to deltatext order; i.e., the very
    first revision is HEAD on trunk, then backwards to the root of a
    branch, then out to the tip of a branch.

    The candidates are all revisions along the CVS delta-dependency
    chain until the next one that has a deltatext (inclusive).  Of the
    candidates, CVSRevisionDeletes are disqualified because, even
    though CVS records their contents, it is impossible to extract
    their fulltext using commands like 'cvs checkout -p'.

    If there is no other CVSRevision that has the same content, return
    CVS_REV itself."""

    # CVS_REV itself, followed by its deltatext ancestors:
    candidates = itertools.chain(
        [cvs_rev],
        self._cvs_file_items.iter_deltatext_ancestors(cvs_rev),
        )

    # The "best" source revision found so far:
    best = None
    for candidate in candidates:
      if not isinstance(candidate, CVSRevisionDelete):
        best = candidate
      if candidate.deltatext_exists:
        # The chain of identical contents ends here (inclusive):
        break

    return best

  def record_fulltext(self, cvs_rev, log, fulltext):
    """Write the fulltext to a blob if it is original and not a delete.

    The reason we go to this trouble is to avoid writing the same file
    contents multiple times for a string of revisions that don't have
    deltatexts (as, for example, happens with dead revisions and
    imported revisions)."""

    if isinstance(cvs_rev, CVSRevisionDelete):
      # There is no need to record a delete revision, and its token
      # will never be needed:
      return None

    source = self._get_original_source(cvs_rev)
    if source.id != cvs_rev.id:
      # Another revision already carries these contents; hand back the
      # original source revision's token instead of writing a blob:
      return source.revision_recorder_token

    # This revision is its own source, so emit its contents as a new
    # blob and return the blob's mark:
    mark = self._mark_generator.gen_id()
    write = self.dump_file.write
    write('blob\n')
    write('mark :%d\n' % (mark,))
    write('data %d\n' % (len(fulltext),))
    write(fulltext)
    write('\n')
    return mark

  def finish_file(self, cvs_file_items):
    # Determine the original source of each CVSSymbol, and store it as
    # the symbol's revision_recorder_token.
    symbols = (
        item
        for item in cvs_file_items.values()
        if isinstance(item, CVSSymbol)
        )
    for symbol in symbols:
      source = symbol.get_cvs_revision_source(cvs_file_items)
      symbol.revision_recorder_token = source.revision_recorder_token

    del self._cvs_file_items

  def finish(self):
    self.dump_file.close()
class SVNCommitCreator:
  """This class creates and yields SVNCommits via process_changeset()."""

  def __init__(self):
    # The revision number to assign to the next new SVNCommit.
    self.revnum_generator = KeyGenerator()

    # A set containing the Projects that have already been
    # initialized:
    self._initialized_projects = set()

  @staticmethod
  def _sort_cvs_revs(cvs_revs):
    """Sort CVS_REVS in place by each revision's RCS filename.

    A key function is used instead of the old cmp-style comparator:
    it evaluates the key only once per element and is the only form
    supported by Python 3."""

    cvs_revs.sort(key=lambda cvs_rev: cvs_rev.cvs_file.rcs_path)

  def _post_commit(self, cvs_revs, motivating_revnum, timestamp):
    """Generate any SVNCommits needed to follow CVS_REVS.

    That is, handle non-trunk default branches.  A revision on a CVS
    non-trunk default branch is visible in a default CVS checkout of
    HEAD.  So we copy such commits over to Subversion's trunk so that
    checking out SVN trunk gives the same output as checking out of
    CVS's default branch."""

    # Only real (non-noop) revisions on a non-trunk default branch
    # need a follow-up commit:
    cvs_revs = [
        cvs_rev
        for cvs_rev in cvs_revs
        if cvs_rev.ntdbr and not isinstance(cvs_rev, CVSRevisionNoop)
        ]

    if cvs_revs:
      self._sort_cvs_revs(cvs_revs)
      # Generate an SVNCommit for all of our default branch cvs_revs.
      yield SVNPostCommit(
          motivating_revnum, cvs_revs, timestamp,
          self.revnum_generator.gen_id(),
          )

  def _process_revision_changeset(self, changeset, timestamp):
    """Process CHANGESET, using TIMESTAMP as the commit time.

    Create and yield one or more SVNCommits in the process.  CHANGESET
    must be an OrderedChangeset.  TIMESTAMP is used as the timestamp
    for any resulting SVNCommits."""

    if not changeset.cvs_item_ids:
      logger.warn('Changeset has no items: %r' % changeset)
      return

    logger.verbose('-' * 60)
    logger.verbose('CVS Revision grouping:')
    logger.verbose(' Time: %s' % time.ctime(timestamp))

    # Generate an SVNCommit unconditionally.  Even if the only change in
    # this group of CVSRevisions is a deletion of an already-deleted
    # file (that is, a CVS revision in state 'dead' whose predecessor
    # was also in state 'dead'), the conversion will still generate a
    # Subversion revision containing the log message for the second dead
    # revision, because we don't want to lose that information.
    cvs_revs = list(changeset.iter_cvs_items())
    self._sort_cvs_revs(cvs_revs)

    svn_commit = SVNPrimaryCommit(
        cvs_revs, timestamp, self.revnum_generator.gen_id()
        )

    yield svn_commit

    for cvs_rev in cvs_revs:
      Ctx()._symbolings_logger.log_revision(cvs_rev, svn_commit.revnum)

    # Generate an SVNPostCommit if we have default branch revs.  If
    # some of the revisions in this commit happened on a non-trunk
    # default branch, then those files have to be copied into trunk
    # manually after being changed on the branch (because the RCS
    # "default branch" appears as head, i.e., trunk, in practice).
    # Unfortunately, Subversion doesn't support copies with sources
    # in the current txn.  All copies must be based in committed
    # revisions.  Therefore, we generate the copies in a new
    # revision.
    for svn_post_commit in self._post_commit(
          cvs_revs, svn_commit.revnum, timestamp
          ):
      yield svn_post_commit

  def _process_tag_changeset(self, changeset, timestamp):
    """Process TagChangeset CHANGESET, producing a SVNTagCommit.

    Filter out CVSTagNoops.  If no CVSTags are left, don't generate a
    SVNTagCommit."""

    if Ctx().trunk_only:
      raise InternalError(
          'TagChangeset encountered during a --trunk-only conversion')

    cvs_tag_ids = [
        cvs_tag.id
        for cvs_tag in changeset.iter_cvs_items()
        if not isinstance(cvs_tag, CVSTagNoop)
        ]
    if cvs_tag_ids:
      yield SVNTagCommit(
          changeset.symbol, cvs_tag_ids, timestamp,
          self.revnum_generator.gen_id(),
          )
    else:
      logger.debug(
          'Omitting %r because it contains only CVSTagNoops' % (changeset,)
          )

  def _process_branch_changeset(self, changeset, timestamp):
    """Process BranchChangeset CHANGESET, producing a SVNBranchCommit.

    Filter out CVSBranchNoops.  If no CVSBranches are left, don't
    generate a SVNBranchCommit."""

    if Ctx().trunk_only:
      raise InternalError(
          'BranchChangeset encountered during a --trunk-only conversion')

    cvs_branches = [
        cvs_branch
        for cvs_branch in changeset.iter_cvs_items()
        if not isinstance(cvs_branch, CVSBranchNoop)
        ]
    if cvs_branches:
      svn_commit = SVNBranchCommit(
          changeset.symbol,
          [cvs_branch.id for cvs_branch in cvs_branches],
          timestamp,
          self.revnum_generator.gen_id(),
          )
      yield svn_commit
      for cvs_branch in cvs_branches:
        Ctx()._symbolings_logger.log_branch_revision(
            cvs_branch, svn_commit.revnum
            )
    else:
      logger.debug(
          'Omitting %r because it contains only CVSBranchNoops'
          % (changeset,)
          )

  def process_changeset(self, changeset, timestamp):
    """Process CHANGESET, using TIMESTAMP for all of its entries.

    Return a generator that generates the resulting SVNCommits.

    The changesets must be fed to this function in proper dependency
    order."""

    # First create any new projects that might be opened by the
    # changeset:
    projects_opened = \
        changeset.get_projects_opened() - self._initialized_projects
    if projects_opened:
      if Ctx().cross_project_commits:
        yield SVNInitialProjectCommit(
            timestamp, projects_opened, self.revnum_generator.gen_id()
            )
      else:
        for project in projects_opened:
          yield SVNInitialProjectCommit(
              timestamp, [project], self.revnum_generator.gen_id()
              )
      self._initialized_projects.update(projects_opened)

    # Dispatch on the changeset type:
    if isinstance(changeset, OrderedChangeset):
      for svn_commit \
            in self._process_revision_changeset(changeset, timestamp):
        yield svn_commit
    elif isinstance(changeset, TagChangeset):
      for svn_commit in self._process_tag_changeset(changeset, timestamp):
        yield svn_commit
    elif isinstance(changeset, BranchChangeset):
      for svn_commit in self._process_branch_changeset(changeset, timestamp):
        yield svn_commit
    else:
      raise TypeError('Illegal changeset %r' % changeset)
class SVNRepositoryMirror:
  """Mirror a Subversion repository and its history.

  Mirror a Subversion repository as it is constructed, one SVNCommit
  at a time.  For each LineOfDevelopment we store a skeleton of the
  directory structure within that LOD for each SVN revision number in
  which it changed.

  The creation of a dumpfile or Subversion repository is handled by
  delegates.  See the add_delegate() method for how to set delegates.

  For each LOD that has been seen so far, an LODHistory instance is
  stored in self._lod_histories.  An LODHistory keeps track of each
  SVNRevision in which files were added to or deleted from that LOD,
  as well as the node id of the node tree describing the LOD contents
  at that SVN revision.

  The LOD trees themselves are stored in the _nodes_db database, which
  maps node ids to nodes.  A node is a map from CVSPath.id to ids of
  the corresponding subnodes.  The _nodes_db is stored on disk and
  each access is expensive.

  The _nodes_db database only holds the nodes for old revisions.  The
  revision that is being constructed is kept in memory in the
  _new_nodes map, which is cheap to access.

  You must invoke start_commit() before each SVNCommit and
  end_commit() afterwards.

  *** WARNING *** Path arguments to methods in this class MUST NOT
  have leading or trailing slashes."""

  class ParentMissingError(Exception):
    """The parent of a path is missing.

    Exception raised if an attempt is made to add a path to the
    repository mirror but the parent's path doesn't exist in the
    youngest revision of the repository."""

    pass

  class PathExistsError(Exception):
    """The path already exists in the repository.

    Exception raised if an attempt is made to add a path to the
    repository mirror and that path already exists in the youngest
    revision of the repository."""

    pass

  def register_artifacts(self, which_pass):
    """Register the artifacts that will be needed for this object."""

    artifact_manager.register_temp_file(
        config.SVN_MIRROR_NODES_INDEX_TABLE, which_pass
        )
    artifact_manager.register_temp_file(
        config.SVN_MIRROR_NODES_STORE, which_pass
        )

  def open(self):
    """Set up the SVNRepositoryMirror and prepare it for SVNCommits."""

    self._key_generator = KeyGenerator()

    self._delegates = [ ]

    # A map from LOD to LODHistory instance for all LODs that have
    # been defines so far:
    self._lod_histories = {}

    # This corresponds to the 'nodes' table in a Subversion fs.  (We
    # don't need a 'representations' or 'strings' table because we
    # only track metadata, not file contents.)
    self._nodes_db = IndexedDatabase(
        artifact_manager.get_temp_file(config.SVN_MIRROR_NODES_STORE),
        artifact_manager.get_temp_file(config.SVN_MIRROR_NODES_INDEX_TABLE),
        DB_OPEN_NEW, serializer=_NodeSerializer()
        )

    # Start at revision 0 without a root node.  It will be created
    # by _open_writable_root_node.
    self._youngest = 0

  def start_commit(self, revnum, revprops):
    """Start a new commit."""

    self._youngest = revnum

    # A map {node_id : {CVSPath : node_id}}.
    self._new_nodes = {}

    self._invoke_delegates('start_commit', revnum, revprops)

  def end_commit(self):
    """Called at the end of each commit.

    This method copies the newly created nodes to the on-disk nodes
    db."""

    # Copy the new nodes to the _nodes_db
    for id, value in self._new_nodes.items():
      self._nodes_db[id] = value

    del self._new_nodes

    self._invoke_delegates('end_commit')

  def _get_lod_history(self, lod):
    """Return the LODHistory instance describing LOD.

    Create a new (empty) LODHistory if it doesn't yet exist."""

    try:
      return self._lod_histories[lod]
    except KeyError:
      lod_history = LODHistory()
      self._lod_histories[lod] = lod_history
      return lod_history

  def _create_empty_node(self):
    """Create and return a new, empty, writable node."""

    new_node = _WritableMirrorNode(self, self._key_generator.gen_id(), {})

    # Note that it is the node's *entries* map that is stored in
    # _new_nodes, not the node object itself:
    self._new_nodes[new_node.id] = new_node.entries

    return new_node

  def _copy_node(self, old_node):
    """Create and return a new, writable node that is a copy of OLD_NODE."""

    new_node = _WritableMirrorNode(
        self, self._key_generator.gen_id(), old_node.entries.copy()
        )

    self._new_nodes[new_node.id] = new_node.entries

    return new_node

  def _get_node(self, id):
    """Return the node for id ID.

    The node might be read from either self._nodes_db or
    self._new_nodes.  Return an instance of _MirrorNode."""

    try:
      # Nodes created within the current commit are writable:
      return _WritableMirrorNode(self, id, self._new_nodes[id])
    except KeyError:
      # Nodes from earlier commits come from disk and are read-only:
      return _ReadOnlyMirrorNode(self, id, self._nodes_db[id])

  def _open_readonly_lod_node(self, lod, revnum):
    """Open a readonly node for the root path of LOD at revision REVNUM.

    Return an instance of _MirrorNode if the path exists; otherwise,
    raise KeyError."""

    lod_history = self._get_lod_history(lod)
    node_id = lod_history.get_id(revnum)
    return self._get_node(node_id)

  def _open_readonly_node(self, cvs_path, lod, revnum):
    """Open a readonly node for CVS_PATH from LOD at REVNUM.

    If cvs_path refers to a leaf node, return None.

    Raise KeyError if the node does not exist."""

    if cvs_path.parent_directory is None:
      return self._open_readonly_lod_node(lod, revnum)
    else:
      # Recurse to the parent, then step down into CVS_PATH:
      parent_node = self._open_readonly_node(
          cvs_path.parent_directory, lod, revnum
          )
      return parent_node[cvs_path]

  def _open_writable_lod_node(self, lod, create, invoke_delegates=True):
    """Open a writable node for the root path in LOD.

    Iff CREATE is True, create the path and any missing directories.
    Return an instance of _WritableMirrorNode.  Raise KeyError if the
    path doesn't already exist and CREATE is not set."""

    lod_history = self._get_lod_history(lod)
    try:
      id = lod_history.get_id()
    except KeyError:
      if create:
        node = self._create_empty_node()
        lod_history.update(self._youngest, node.id)
        if invoke_delegates:
          self._invoke_delegates('initialize_lod', lod)
      else:
        raise
    else:
      node = self._get_node(id)
      if not isinstance(node, _WritableMirrorNode):
        # Node was created in an earlier revision, so we have to copy
        # it to make it writable:
        node = self._copy_node(node)
        lod_history.update(self._youngest, node.id)

    return node

  def _open_writable_node(self, cvs_directory, lod, create):
    """Open a writable node for CVS_DIRECTORY in LOD.

    Iff CREATE is True, create a directory node at SVN_PATH and any
    missing directories.  Return an instance of _WritableMirrorNode.

    Raise KeyError if CVS_DIRECTORY doesn't exist and CREATE is not
    set."""

    if cvs_directory.parent_directory is None:
      return self._open_writable_lod_node(lod, create)

    # Open (and, if CREATE, possibly create) all ancestors first:
    parent_node = self._open_writable_node(
        cvs_directory.parent_directory, lod, create
        )
    try:
      node = parent_node[cvs_directory]
    except KeyError:
      if create:
        # The component does not exist, so we create it.
        new_node = self._create_empty_node()
        parent_node[cvs_directory] = new_node
        self._invoke_delegates('mkdir', lod, cvs_directory)
        return new_node
      else:
        raise
    else:
      if isinstance(node, _WritableMirrorNode):
        return node
      elif isinstance(node, _ReadOnlyMirrorNode):
        # Copy-on-write: replace the parent's entry with a writable
        # copy of the node:
        new_node = self._copy_node(node)
        parent_node[cvs_directory] = new_node
        return new_node
      else:
        raise InternalError(
            'Attempt to modify file at %s in mirror' % (cvs_directory,)
            )

  def delete_lod(self, lod):
    """Delete the main path for LOD from the tree.

    The path must currently exist.  Silently refuse to delete trunk
    paths."""

    if isinstance(lod, Trunk):
      # Never delete a Trunk path.
      return

    lod_history = self._get_lod_history(lod)
    if not lod_history.exists():
      raise KeyError()
    lod_history.update(self._youngest, None)

    self._invoke_delegates('delete_lod', lod)

  def delete_path(self, cvs_path, lod, should_prune=False):
    """Delete CVS_PATH from LOD."""

    if cvs_path.parent_directory is None:
      self.delete_lod(lod)
      return
    else:
      parent_node = self._open_writable_node(
          cvs_path.parent_directory, lod, False
          )
      del parent_node[cvs_path]
      self._invoke_delegates('delete_path', lod, cvs_path)

      # The following recursion makes pruning an O(n^2) operation in the
      # worst case (where n is the depth of SVN_PATH), but the worst case
      # is probably rare, and the constant cost is pretty low.  Another
      # drawback is that we issue a delete for each path and not just
      # a single delete for the topmost directory pruned.
      if should_prune and len(parent_node) == 0:
        self.delete_path(cvs_path.parent_directory, lod, True)

  def initialize_project(self, project):
    """Create the basic structure for PROJECT."""

    self._invoke_delegates('initialize_project', project)

    # Don't invoke delegates for the trunk node creation itself; the
    # 'initialize_project' delegate call above already covers it:
    self._open_writable_lod_node(
        Ctx()._symbol_db.get_symbol(project.trunk_id),
        create=True, invoke_delegates=False
        )

  def change_path(self, cvs_rev):
    """Register a change in self._youngest for the CVS_REV's svn_path."""

    # We do not have to update the nodes because our mirror is only
    # concerned with the presence or absence of paths, and a file
    # content change does not cause any path changes.
    self._invoke_delegates('change_path', SVNCommitItem(cvs_rev, False))

  def add_path(self, cvs_rev):
    """Add the CVS_REV's svn_path to the repository mirror."""

    cvs_file = cvs_rev.cvs_file
    parent_node = self._open_writable_node(
        cvs_file.parent_directory, cvs_rev.lod, True
        )

    if cvs_file in parent_node:
      raise self.PathExistsError(
          'Attempt to add path \'%s\' to repository mirror '
          'when it already exists in the mirror.'
          % (cvs_rev.get_svn_path(),)
          )
    # Files are leaves; they carry no subtree node:
    parent_node[cvs_file] = None

    self._invoke_delegates('add_path', SVNCommitItem(cvs_rev, True))

  def copy_lod(self, src_lod, dest_lod, src_revnum):
    """Copy all of SRC_LOD at SRC_REVNUM to DST_LOD.

    In the youngest revision of the repository, the destination LOD
    *must not* already exist.

    Return the new node at DEST_LOD.  Note that this node is not
    necessarily writable, though its parent node necessarily is."""

    dest_path = dest_lod.get_path()

    # Get the node of our src_path
    src_node = self._open_readonly_lod_node(src_lod, src_revnum)

    dest_lod_history = self._get_lod_history(dest_lod)
    if dest_lod_history.exists():
      raise self.PathExistsError(
          "Attempt to add path '%s' to repository mirror "
          "when it already exists in the mirror." % dest_path
          )

    dest_lod_history.update(self._youngest, src_node.id)

    self._invoke_delegates('copy_lod', src_lod, dest_lod, src_revnum)

    # This is a cheap copy, so src_node has the same contents as the
    # new destination node.
    return src_node

  def copy_path(
        self, cvs_path, src_lod, dest_lod, src_revnum, create_parent=False
        ):
    """Copy CVS_PATH from SRC_LOD at SRC_REVNUM to DST_LOD.

    In the youngest revision of the repository, the destination's
    parent *must* exist unless CREATE_PARENT is specified.  But the
    destination itself *must not* exist.

    Return the new node at (CVS_PATH, DEST_LOD).  Note that this node
    is not necessarily writable, though its parent node necessarily
    is."""

    if cvs_path.parent_directory is None:
      return self.copy_lod(src_lod, dest_lod, src_revnum)

    # Get the node of our source, or None if it is a file:
    src_node = self._open_readonly_node(cvs_path, src_lod, src_revnum)

    # Get the parent path of the destination:
    try:
      dest_parent_node = self._open_writable_node(
          cvs_path.parent_directory, dest_lod, create_parent
          )
    except KeyError:
      raise self.ParentMissingError(
          'Attempt to add path \'%s\' to repository mirror, '
          'but its parent directory doesn\'t exist in the mirror.'
          % (dest_lod.get_path(cvs_path.cvs_path),)
          )

    if cvs_path in dest_parent_node:
      raise self.PathExistsError(
          'Attempt to add path \'%s\' to repository mirror '
          'when it already exists in the mirror.'
          % (dest_lod.get_path(cvs_path.cvs_path),)
          )

    dest_parent_node[cvs_path] = src_node

    self._invoke_delegates(
        'copy_path',
        src_lod.get_path(cvs_path.cvs_path),
        dest_lod.get_path(cvs_path.cvs_path),
        src_revnum
        )

    # This is a cheap copy, so src_node has the same contents as the
    # new destination node.
    return src_node

  def fill_symbol(self, svn_symbol_commit, fill_source):
    """Perform all copies for the CVSSymbols in SVN_SYMBOL_COMMIT.

    The symbolic name is guaranteed to exist in the Subversion
    repository by the end of this call, even if there are no paths
    under it."""

    symbol = svn_symbol_commit.symbol

    try:
      dest_node = self._open_writable_lod_node(symbol, False)
    except KeyError:
      dest_node = None

    self._fill_directory(symbol, dest_node, fill_source, None)

  def _prune_extra_entries(
        self, dest_cvs_path, symbol, dest_node, src_entries
        ):
    """Delete any entries in DEST_NODE that are not in SRC_ENTRIES.

    This might require creating a new writable node, so return a
    possibly-modified dest_node."""

    delete_list = [
        cvs_path
        for cvs_path in dest_node
        if cvs_path not in src_entries
        ]
    if delete_list:
      if not isinstance(dest_node, _WritableMirrorNode):
        dest_node = self._open_writable_node(dest_cvs_path, symbol, False)
      # Sort the delete list so that the output is in a consistent
      # order:
      delete_list.sort()
      for cvs_path in delete_list:
        del dest_node[cvs_path]
        self._invoke_delegates('delete_path', symbol, cvs_path)

    return dest_node

  def _fill_directory(self, symbol, dest_node, fill_source, parent_source):
    """Fill the tag or branch SYMBOL at the path indicated by FILL_SOURCE.

    Use items from FILL_SOURCE, and recurse into the child items.

    Fill SYMBOL starting at the path FILL_SOURCE.cvs_path.  DEST_NODE
    is the node of this destination path, or None if the destination
    does not yet exist.  All directories above this path have already
    been filled.  FILL_SOURCE is a FillSource instance describing the
    items within a subtree of the repository that still need to be
    copied to the destination.

    PARENT_SOURCE is the SVNRevisionRange that was used to copy the
    parent directory, if it was copied in this commit.  We prefer to
    copy from the same source as was used for the parent, since it
    typically requires less touching-up.  If PARENT_SOURCE is None,
    then the parent directory was not copied in this commit, so no
    revision is preferable to any other."""

    copy_source = fill_source.compute_best_source(parent_source)

    # Figure out if we shall copy to this destination and delete any
    # destination path that is in the way.
    if dest_node is None:
      # The destination does not exist at all, so it definitely has to
      # be copied:
      dest_node = self.copy_path(
          fill_source.cvs_path, copy_source.source_lod,
          symbol, copy_source.opening_revnum
          )
    elif (parent_source is not None) and (
          copy_source.source_lod != parent_source.source_lod
          or copy_source.opening_revnum != parent_source.opening_revnum
          ):
      # The parent path was copied from a different source than we
      # need to use, so we have to delete the version that was copied
      # with the parent then re-copy from the correct source:
      self.delete_path(fill_source.cvs_path, symbol)
      dest_node = self.copy_path(
          fill_source.cvs_path, copy_source.source_lod,
          symbol, copy_source.opening_revnum
          )
    else:
      copy_source = parent_source

    # Get the map {entry : FillSource} for entries within this
    # directory that need filling.
    src_entries = {}
    for (cvs_path, fill_subsource) in fill_source.get_subsources():
      src_entries[cvs_path] = fill_subsource

    if copy_source is not None:
      dest_node = self._prune_extra_entries(
          fill_source.cvs_path, symbol, dest_node, src_entries
          )

    # Recurse into the SRC_ENTRIES ids sorted in alphabetical order.
cvs_paths = src_entries.keys() cvs_paths.sort() for cvs_path in cvs_paths: if isinstance(cvs_path, CVSDirectory): # Path is a CVSDirectory: try: dest_subnode = dest_node[cvs_path] except KeyError: # Path didn't exist at all; it has to be created: dest_subnode = None self._fill_directory( symbol, dest_subnode, src_entries[cvs_path], copy_source ) else: # Path is a CVSFile: self._fill_file( symbol, cvs_path in dest_node, src_entries[cvs_path], copy_source ) def _fill_file(self, symbol, dest_existed, fill_source, parent_source): """Fill the tag or branch SYMBOL at the path indicated by FILL_SOURCE. Use items from FILL_SOURCE. Fill SYMBOL at path FILL_SOURCE.cvs_path. DEST_NODE is the node of this destination path, or None if the destination does not yet exist. All directories above this path have already been filled as needed. FILL_SOURCE is a FillSource instance describing the item that needs to be copied to the destination. PARENT_SOURCE is the source from which the parent directory was copied, or None if the parent directory was not copied during this commit. We prefer to copy from PARENT_SOURCE, since it typically requires less touching-up. If PARENT_SOURCE is None, then the parent directory was not copied in this commit, so no revision is preferable to any other.""" copy_source = fill_source.compute_best_source(parent_source) # Figure out if we shall copy to this destination and delete any # destination path that is in the way. 
if not dest_existed: # The destination does not exist at all, so it definitely has to # be copied: self.copy_path( fill_source.cvs_path, copy_source.source_lod, symbol, copy_source.opening_revnum ) elif (parent_source is not None) and ( copy_source.source_lod != parent_source.source_lod or copy_source.opening_revnum != parent_source.opening_revnum ): # The parent path was copied from a different source than we # need to use, so we have to delete the version that was copied # with the parent and then re-copy from the correct source: self.delete_path(fill_source.cvs_path, symbol) self.copy_path( fill_source.cvs_path, copy_source.source_lod, symbol, copy_source.opening_revnum ) def add_delegate(self, delegate): """Adds DELEGATE to self._delegates. For every delegate you add, as soon as SVNRepositoryMirror performs a repository action method, SVNRepositoryMirror will call the delegate's corresponding repository action method. Multiple delegates will be called in the order that they are added. See SVNRepositoryMirrorDelegate for more information.""" self._delegates.append(delegate) def _invoke_delegates(self, method, *args): """Invoke a method on each delegate. Iterate through each of our delegates, in the order that they were added, and call the delegate's method named METHOD with the arguments in ARGS.""" for delegate in self._delegates: getattr(delegate, method)(*args) def close(self): """Call the delegate finish methods and close databases.""" self._invoke_delegates('finish') self._lod_histories = None self._nodes_db.close() self._nodes_db = None
class GitRevisionCollector(RevisionCollector):
  """Output file revisions to git-fast-import."""

  def __init__(self, revision_reader, blob_filename=None):
    self.revision_reader = revision_reader
    # If None, blobs are written to a managed temporary file instead
    # of a caller-supplied path:
    self.blob_filename = blob_filename

  def register_artifacts(self, which_pass):
    self.revision_reader.register_artifacts(which_pass)
    if self.blob_filename is None:
      # We will need the managed temp file to hold the blobs:
      artifact_manager.register_temp_file(
          config.GIT_BLOB_DATAFILE, which_pass,
          )

  def start(self):
    self.revision_reader.start()
    # Choose the output filename, then open it once:
    if self.blob_filename is None:
      filename = artifact_manager.get_temp_file(config.GIT_BLOB_DATAFILE)
    else:
      filename = self.blob_filename
    self.dump_file = open(filename, 'wb')
    self._mark_generator = KeyGenerator()

  def _process_revision(self, cvs_rev):
    """Write the revision fulltext to a blob if it is not dead."""

    if isinstance(cvs_rev, CVSRevisionDelete):
      # There is no need to record a delete revision, and its token
      # will never be needed:
      return

    # FIXME: We have to decide what to do about keyword substitution
    # and eol_style here:
    fulltext = self.revision_reader.get_content(cvs_rev)

    mark = self._mark_generator.gen_id()
    write = self.dump_file.write
    write('blob\n')
    write('mark :%d\n' % (mark,))
    write('data %d\n' % (len(fulltext),))
    write(fulltext)
    write('\n')
    cvs_rev.revision_reader_token = mark

  def _process_symbol(self, cvs_symbol, cvs_file_items):
    """Record the original source of CVS_SYMBOL.

    Determine the original revision source of CVS_SYMBOL, and store it
    as the symbol's revision_reader_token."""

    cvs_source = cvs_symbol.get_cvs_revision_source(cvs_file_items)
    cvs_symbol.revision_reader_token = cvs_source.revision_reader_token

  def process_file(self, cvs_file_items):
    # First pass: emit a blob for every live revision:
    for lod_items in cvs_file_items.iter_lods():
      for cvs_rev in lod_items.cvs_revisions:
        self._process_revision(cvs_rev)

    # Now that all CVSRevisions' revision_reader_tokens are set,
    # iterate through symbols and set their tokens to those of their
    # original source revisions:
    for lod_items in cvs_file_items.iter_lods():
      branch = lod_items.cvs_branch
      if branch is not None:
        self._process_symbol(branch, cvs_file_items)
      for cvs_tag in lod_items.cvs_tags:
        self._process_symbol(cvs_tag, cvs_file_items)

  def finish(self):
    self.revision_reader.finish()
    self.dump_file.close()
class SVNRepositoryMirror:
  """Mirror a Subversion repository and its history.

  Mirror a Subversion repository as it is constructed, one SVNCommit
  at a time.  For each LineOfDevelopment we store a skeleton of the
  directory structure within that LOD for each SVN revision number in
  which it changed.

  The creation of a dumpfile or Subversion repository is handled by
  delegates.  See the add_delegate() method for how to set delegates.

  For each LOD that has been seen so far, an LODHistory instance is
  stored in self._lod_histories.  An LODHistory keeps track of each
  SVNRevision in which files were added to or deleted from that LOD,
  as well as the node id of the node tree describing the LOD contents
  at that SVN revision.

  The LOD trees themselves are stored in the _nodes_db database, which
  maps node ids to nodes.  A node is a map from CVSPath.id to ids of
  the corresponding subnodes.  The _nodes_db is stored on disk and
  each access is expensive.

  The _nodes_db database only holds the nodes for old revisions.  The
  revision that is being constructed is kept in memory in the
  _new_nodes map, which is cheap to access.

  You must invoke start_commit() before each SVNCommit and
  end_commit() afterwards.

  *** WARNING *** Path arguments to methods in this class MUST NOT
  have leading or trailing slashes."""

  class ParentMissingError(Exception):
    """The parent of a path is missing.

    Exception raised if an attempt is made to add a path to the
    repository mirror but the parent's path doesn't exist in the
    youngest revision of the repository."""

    pass

  class PathExistsError(Exception):
    """The path already exists in the repository.

    Exception raised if an attempt is made to add a path to the
    repository mirror and that path already exists in the youngest
    revision of the repository."""

    pass

  def register_artifacts(self, which_pass):
    """Register the artifacts that will be needed for this object."""

    artifact_manager.register_temp_file(
        config.SVN_MIRROR_NODES_INDEX_TABLE, which_pass)
    artifact_manager.register_temp_file(config.SVN_MIRROR_NODES_STORE,
                                        which_pass)

  def open(self):
    """Set up the SVNRepositoryMirror and prepare it for SVNCommits."""

    self._key_generator = KeyGenerator()

    self._delegates = []

    # A map from LOD to LODHistory instance for all LODs that have
    # been defined so far:
    self._lod_histories = {}

    # This corresponds to the 'nodes' table in a Subversion fs.  (We
    # don't need a 'representations' or 'strings' table because we
    # only track metadata, not file contents.)
    self._nodes_db = IndexedDatabase(
        artifact_manager.get_temp_file(config.SVN_MIRROR_NODES_STORE),
        artifact_manager.get_temp_file(
            config.SVN_MIRROR_NODES_INDEX_TABLE),
        DB_OPEN_NEW, serializer=_NodeSerializer())

    # Start at revision 0 without a root node.  It will be created
    # by _open_writable_root_node.
    self._youngest = 0

  def start_commit(self, revnum, revprops):
    """Start a new commit."""

    self._youngest = revnum

    # A map {node_id : {CVSPath : node_id}}.
    self._new_nodes = {}

    self._invoke_delegates('start_commit', revnum, revprops)

  def end_commit(self):
    """Called at the end of each commit.

    This method copies the newly created nodes to the on-disk nodes
    db."""

    # Copy the new nodes to the _nodes_db
    for id, value in self._new_nodes.items():
      self._nodes_db[id] = value

    del self._new_nodes

    self._invoke_delegates('end_commit')

  def _get_lod_history(self, lod):
    """Return the LODHistory instance describing LOD.

    Create a new (empty) LODHistory if it doesn't yet exist."""

    try:
      return self._lod_histories[lod]
    except KeyError:
      lod_history = LODHistory()
      self._lod_histories[lod] = lod_history
      return lod_history

  def _create_empty_node(self):
    """Create and return a new, empty, writable node."""

    new_node = _WritableMirrorNode(self, self._key_generator.gen_id(), {})

    self._new_nodes[new_node.id] = new_node.entries

    return new_node

  def _copy_node(self, old_node):
    """Create and return a new, writable node that is a copy of OLD_NODE."""

    new_node = _WritableMirrorNode(
        self, self._key_generator.gen_id(), old_node.entries.copy())

    self._new_nodes[new_node.id] = new_node.entries

    return new_node

  def _get_node(self, id):
    """Return the node for id ID.

    The node might be read from either self._nodes_db or
    self._new_nodes.  Return an instance of _MirrorNode."""

    # Nodes in _new_nodes belong to the commit under construction and
    # are still writable; anything in _nodes_db is historical and
    # read-only:
    try:
      return _WritableMirrorNode(self, id, self._new_nodes[id])
    except KeyError:
      return _ReadOnlyMirrorNode(self, id, self._nodes_db[id])

  def _open_readonly_lod_node(self, lod, revnum):
    """Open a readonly node for the root path of LOD at revision REVNUM.

    Return an instance of _MirrorNode if the path exists; otherwise,
    raise KeyError."""

    lod_history = self._get_lod_history(lod)
    node_id = lod_history.get_id(revnum)
    return self._get_node(node_id)

  def _open_readonly_node(self, cvs_path, lod, revnum):
    """Open a readonly node for CVS_PATH from LOD at REVNUM.

    If cvs_path refers to a leaf node, return None.  Raise KeyError if
    the node does not exist."""

    if cvs_path.parent_directory is None:
      return self._open_readonly_lod_node(lod, revnum)
    else:
      # Recurse to the parent directory, then look ourselves up in it:
      parent_node = self._open_readonly_node(cvs_path.parent_directory,
                                             lod, revnum)
      return parent_node[cvs_path]

  def _open_writable_lod_node(self, lod, create, invoke_delegates=True):
    """Open a writable node for the root path in LOD.

    Iff CREATE is True, create the path and any missing directories.
    Return an instance of _WritableMirrorNode.

    Raise KeyError if the path doesn't already exist and CREATE is not
    set."""

    lod_history = self._get_lod_history(lod)
    try:
      id = lod_history.get_id()
    except KeyError:
      if create:
        node = self._create_empty_node()
        lod_history.update(self._youngest, node.id)
        if invoke_delegates:
          self._invoke_delegates('initialize_lod', lod)
      else:
        raise
    else:
      node = self._get_node(id)
      if not isinstance(node, _WritableMirrorNode):
        # Node was created in an earlier revision, so we have to copy
        # it to make it writable:
        node = self._copy_node(node)
        lod_history.update(self._youngest, node.id)

    return node

  def _open_writable_node(self, cvs_directory, lod, create):
    """Open a writable node for CVS_DIRECTORY in LOD.

    Iff CREATE is True, create a directory node at SVN_PATH and any
    missing directories.  Return an instance of _WritableMirrorNode.

    Raise KeyError if CVS_DIRECTORY doesn't exist and CREATE is not
    set."""

    if cvs_directory.parent_directory is None:
      return self._open_writable_lod_node(lod, create)

    # Opening the parent writable first copies the whole path from the
    # root down (copy-on-write):
    parent_node = self._open_writable_node(cvs_directory.parent_directory,
                                           lod, create)

    try:
      node = parent_node[cvs_directory]
    except KeyError:
      if create:
        # The component does not exist, so we create it.
        new_node = self._create_empty_node()
        parent_node[cvs_directory] = new_node
        self._invoke_delegates('mkdir', lod, cvs_directory)
        return new_node
      else:
        raise
    else:
      if isinstance(node, _WritableMirrorNode):
        return node
      elif isinstance(node, _ReadOnlyMirrorNode):
        # Copy-on-write: replace the historical node with a fresh
        # writable copy linked into the parent:
        new_node = self._copy_node(node)
        parent_node[cvs_directory] = new_node
        return new_node
      else:
        # Files are stored as None entries; they cannot be opened as
        # directories:
        raise InternalError('Attempt to modify file at %s in mirror'
                            % (cvs_directory, ))

  def delete_lod(self, lod):
    """Delete the main path for LOD from the tree.

    The path must currently exist.  Silently refuse to delete trunk
    paths."""

    if isinstance(lod, Trunk):
      # Never delete a Trunk path.
      return

    lod_history = self._get_lod_history(lod)
    if not lod_history.exists():
      raise KeyError()
    lod_history.update(self._youngest, None)

    self._invoke_delegates('delete_lod', lod)

  def delete_path(self, cvs_path, lod, should_prune=False):
    """Delete CVS_PATH from LOD.

    If SHOULD_PRUNE is set and the deletion leaves the parent
    directory empty, delete the parent too (recursively upwards)."""

    if cvs_path.parent_directory is None:
      self.delete_lod(lod)
      return
    else:
      parent_node = self._open_writable_node(cvs_path.parent_directory,
                                             lod, False)
      del parent_node[cvs_path]
      self._invoke_delegates('delete_path', lod, cvs_path)

      # The following recursion makes pruning an O(n^2) operation in the
      # worst case (where n is the depth of SVN_PATH), but the worst case
      # is probably rare, and the constant cost is pretty low.  Another
      # drawback is that we issue a delete for each path and not just
      # a single delete for the topmost directory pruned.
      if should_prune and len(parent_node) == 0:
        self.delete_path(cvs_path.parent_directory, lod, True)

  def initialize_project(self, project):
    """Create the basic structure for PROJECT."""

    self._invoke_delegates('initialize_project', project)

    # The delegate call above already covers the creation, so suppress
    # the per-LOD delegate notification here:
    self._open_writable_lod_node(Ctx()._symbol_db.get_symbol(
        project.trunk_id), create=True, invoke_delegates=False)

  def change_path(self, cvs_rev):
    """Register a change in self._youngest for the CVS_REV's svn_path."""

    # We do not have to update the nodes because our mirror is only
    # concerned with the presence or absence of paths, and a file
    # content change does not cause any path changes.
    self._invoke_delegates('change_path', SVNCommitItem(cvs_rev, False))

  def add_path(self, cvs_rev):
    """Add the CVS_REV's svn_path to the repository mirror.

    Raise PathExistsError if the path is already present."""

    cvs_file = cvs_rev.cvs_file
    parent_node = self._open_writable_node(cvs_file.parent_directory,
                                           cvs_rev.lod, True)

    if cvs_file in parent_node:
      raise self.PathExistsError(
          'Attempt to add path \'%s\' to repository mirror '
          'when it already exists in the mirror.'
          % (cvs_rev.get_svn_path(), ))

    # Files are leaves; they carry no subtree node:
    parent_node[cvs_file] = None

    self._invoke_delegates('add_path', SVNCommitItem(cvs_rev, True))

  def copy_lod(self, src_lod, dest_lod, src_revnum):
    """Copy all of SRC_LOD at SRC_REVNUM to DST_LOD.

    In the youngest revision of the repository, the destination LOD
    *must not* already exist.

    Return the new node at DEST_LOD.  Note that this node is not
    necessarily writable, though its parent node necessarily is."""

    dest_path = dest_lod.get_path()

    # Get the node of our src_path
    src_node = self._open_readonly_lod_node(src_lod, src_revnum)

    dest_lod_history = self._get_lod_history(dest_lod)
    if dest_lod_history.exists():
      raise self.PathExistsError(
          "Attempt to add path '%s' to repository mirror "
          "when it already exists in the mirror." % dest_path)

    # Share the source's node id -- nodes are immutable once written:
    dest_lod_history.update(self._youngest, src_node.id)

    self._invoke_delegates('copy_lod', src_lod, dest_lod, src_revnum)

    # This is a cheap copy, so src_node has the same contents as the
    # new destination node.
    return src_node

  def copy_path(self, cvs_path, src_lod, dest_lod, src_revnum,
                create_parent=False):
    """Copy CVS_PATH from SRC_LOD at SRC_REVNUM to DST_LOD.

    In the youngest revision of the repository, the destination's
    parent *must* exist unless CREATE_PARENT is specified.  But the
    destination itself *must not* exist.

    Return the new node at (CVS_PATH, DEST_LOD).  Note that this node
    is not necessarily writable, though its parent node necessarily
    is."""

    if cvs_path.parent_directory is None:
      return self.copy_lod(src_lod, dest_lod, src_revnum)

    # Get the node of our source, or None if it is a file:
    src_node = self._open_readonly_node(cvs_path, src_lod, src_revnum)

    # Get the parent path of the destination:
    try:
      dest_parent_node = self._open_writable_node(
          cvs_path.parent_directory, dest_lod, create_parent)
    except KeyError:
      raise self.ParentMissingError(
          'Attempt to add path \'%s\' to repository mirror, '
          'but its parent directory doesn\'t exist in the mirror.'
          % (dest_lod.get_path(cvs_path.cvs_path), ))

    if cvs_path in dest_parent_node:
      raise self.PathExistsError(
          'Attempt to add path \'%s\' to repository mirror '
          'when it already exists in the mirror.'
          % (dest_lod.get_path(cvs_path.cvs_path), ))

    dest_parent_node[cvs_path] = src_node

    self._invoke_delegates('copy_path',
                           src_lod.get_path(cvs_path.cvs_path),
                           dest_lod.get_path(cvs_path.cvs_path),
                           src_revnum)

    # This is a cheap copy, so src_node has the same contents as the
    # new destination node.
    return src_node

  def fill_symbol(self, svn_symbol_commit, fill_source):
    """Perform all copies for the CVSSymbols in SVN_SYMBOL_COMMIT.

    The symbolic name is guaranteed to exist in the Subversion
    repository by the end of this call, even if there are no paths
    under it."""

    symbol = svn_symbol_commit.symbol

    try:
      dest_node = self._open_writable_lod_node(symbol, False)
    except KeyError:
      # The symbol's LOD doesn't exist yet; _fill_directory() will
      # create it:
      dest_node = None

    self._fill_directory(symbol, dest_node, fill_source, None)

  def _prune_extra_entries(self, dest_cvs_path, symbol,
                           dest_node, src_entries):
    """Delete any entries in DEST_NODE that are not in SRC_ENTRIES.

    This might require creating a new writable node, so return a
    possibly-modified dest_node."""

    delete_list = [
        cvs_path
        for cvs_path in dest_node
        if cvs_path not in src_entries
        ]
    if delete_list:
      if not isinstance(dest_node, _WritableMirrorNode):
        # Deleting requires a writable node:
        dest_node = self._open_writable_node(dest_cvs_path, symbol, False)
      # Sort the delete list so that the output is in a consistent
      # order:
      delete_list.sort()
      for cvs_path in delete_list:
        del dest_node[cvs_path]
        self._invoke_delegates('delete_path', symbol, cvs_path)

    return dest_node

  def _fill_directory(self, symbol, dest_node, fill_source, parent_source):
    """Fill the tag or branch SYMBOL at the path indicated by FILL_SOURCE.

    Use items from FILL_SOURCE, and recurse into the child items.

    Fill SYMBOL starting at the path FILL_SOURCE.cvs_path.  DEST_NODE
    is the node of this destination path, or None if the destination
    does not yet exist.  All directories above this path have already
    been filled.  FILL_SOURCE is a FillSource instance describing the
    items within a subtree of the repository that still need to be
    copied to the destination.

    PARENT_SOURCE is the SVNRevisionRange that was used to copy the
    parent directory, if it was copied in this commit.  We prefer to
    copy from the same source as was used for the parent, since it
    typically requires less touching-up.  If PARENT_SOURCE is None,
    then the parent directory was not copied in this commit, so no
    revision is preferable to any other."""

    copy_source = fill_source.compute_best_source(parent_source)

    # Figure out if we shall copy to this destination and delete any
    # destination path that is in the way.
    if dest_node is None:
      # The destination does not exist at all, so it definitely has to
      # be copied:
      dest_node = self.copy_path(fill_source.cvs_path,
                                 copy_source.source_lod,
                                 symbol, copy_source.opening_revnum)
    elif (parent_source is not None) and (
        copy_source.source_lod != parent_source.source_lod
        or copy_source.opening_revnum != parent_source.opening_revnum):
      # The parent path was copied from a different source than we
      # need to use, so we have to delete the version that was copied
      # with the parent then re-copy from the correct source:
      self.delete_path(fill_source.cvs_path, symbol)
      dest_node = self.copy_path(fill_source.cvs_path,
                                 copy_source.source_lod,
                                 symbol, copy_source.opening_revnum)
    else:
      # The destination exists and was copied along with the parent;
      # remember which source produced it:
      copy_source = parent_source

    # Get the map {entry : FillSource} for entries within this
    # directory that need filling.
    src_entries = {}
    for (cvs_path, fill_subsource) in fill_source.get_subsources():
      src_entries[cvs_path] = fill_subsource

    if copy_source is not None:
      dest_node = self._prune_extra_entries(fill_source.cvs_path, symbol,
                                            dest_node, src_entries)

    # Recurse into the SRC_ENTRIES ids sorted in alphabetical order.
    # NOTE: keys()/sort() in-place is Python 2 style.
    cvs_paths = src_entries.keys()
    cvs_paths.sort()
    for cvs_path in cvs_paths:
      if isinstance(cvs_path, CVSDirectory):
        # Path is a CVSDirectory:
        try:
          dest_subnode = dest_node[cvs_path]
        except KeyError:
          # Path didn't exist at all; it has to be created:
          dest_subnode = None
        self._fill_directory(symbol, dest_subnode,
                             src_entries[cvs_path], copy_source)
      else:
        # Path is a CVSFile:
        self._fill_file(symbol, cvs_path in dest_node,
                        src_entries[cvs_path], copy_source)

  def _fill_file(self, symbol, dest_existed, fill_source, parent_source):
    """Fill the tag or branch SYMBOL at the path indicated by FILL_SOURCE.

    Use items from FILL_SOURCE.

    Fill SYMBOL at path FILL_SOURCE.cvs_path.  DEST_EXISTED indicates
    whether the destination path already exists.  All directories
    above this path have already been filled as needed.  FILL_SOURCE
    is a FillSource instance describing the item that needs to be
    copied to the destination.

    PARENT_SOURCE is the source from which the parent directory was
    copied, or None if the parent directory was not copied during this
    commit.  We prefer to copy from PARENT_SOURCE, since it typically
    requires less touching-up.  If PARENT_SOURCE is None, then the
    parent directory was not copied in this commit, so no revision is
    preferable to any other."""

    copy_source = fill_source.compute_best_source(parent_source)

    # Figure out if we shall copy to this destination and delete any
    # destination path that is in the way.
    if not dest_existed:
      # The destination does not exist at all, so it definitely has to
      # be copied:
      self.copy_path(fill_source.cvs_path,
                     copy_source.source_lod,
                     symbol, copy_source.opening_revnum)
    elif (parent_source is not None) and (
        copy_source.source_lod != parent_source.source_lod
        or copy_source.opening_revnum != parent_source.opening_revnum):
      # The parent path was copied from a different source than we
      # need to use, so we have to delete the version that was copied
      # with the parent and then re-copy from the correct source:
      self.delete_path(fill_source.cvs_path, symbol)
      self.copy_path(fill_source.cvs_path,
                     copy_source.source_lod,
                     symbol, copy_source.opening_revnum)

  def add_delegate(self, delegate):
    """Adds DELEGATE to self._delegates.

    For every delegate you add, as soon as SVNRepositoryMirror
    performs a repository action method, SVNRepositoryMirror will call
    the delegate's corresponding repository action method.  Multiple
    delegates will be called in the order that they are added.  See
    SVNRepositoryMirrorDelegate for more information."""

    self._delegates.append(delegate)

  def _invoke_delegates(self, method, *args):
    """Invoke a method on each delegate.

    Iterate through each of our delegates, in the order that they were
    added, and call the delegate's method named METHOD with the
    arguments in ARGS."""

    for delegate in self._delegates:
      getattr(delegate, method)(*args)

  def close(self):
    """Call the delegate finish methods and close databases."""

    self._invoke_delegates('finish')
    self._lod_histories = None
    self._nodes_db.close()
    self._nodes_db = None
class GitRevisionRecorder(FulltextRevisionRecorder):
  """Output file revisions to git-fast-import."""

  def __init__(self, blob_filename):
    # Name of the file to which blob records are written:
    self.blob_filename = blob_filename

  def start(self):
    self.dump_file = open(self.blob_filename, 'wb')
    self._mark_generator = KeyGenerator()

  def start_file(self, cvs_file_items):
    # Remember the current file's items for _get_original_source():
    self._cvs_file_items = cvs_file_items

  def _get_original_source(self, cvs_rev):
    """Return the original source of the contents of CVS_REV.

    Return the first non-delete CVSRevision with the same contents as
    CVS_REV.  'First' here refers to deltatext order; i.e., the very
    first revision is HEAD on trunk, then backwards to the root of a
    branch, then out to the tip of a branch.

    The candidates are all revisions along the CVS delta-dependency
    chain until the next one that has a deltatext (inclusive).  Of the
    candidates, CVSRevisionDeletes are disqualified because, even
    though CVS records their contents, it is impossible to extract
    their fulltext using commands like 'cvs checkout -p'.

    If there is no other CVSRevision that has the same content, return
    CVS_REV itself."""

    chosen = None
    candidates = itertools.chain(
        [cvs_rev], self._cvs_file_items.iter_deltatext_ancestors(cvs_rev)
        )
    for candidate in candidates:
      if not isinstance(candidate, CVSRevisionDelete):
        chosen = candidate
      if candidate.deltatext_exists:
        # The run of revisions sharing this content ends here:
        break

    return chosen

  def _write_blob(self, fulltext):
    """Emit FULLTEXT as a git-fast-import blob; return its mark."""

    mark = self._mark_generator.gen_id()
    self.dump_file.write('blob\n')
    self.dump_file.write('mark :%d\n' % (mark, ))
    self.dump_file.write('data %d\n' % (len(fulltext), ))
    self.dump_file.write(fulltext)
    self.dump_file.write('\n')
    return mark

  def record_fulltext(self, cvs_rev, log, fulltext):
    """Write the fulltext to a blob if it is original and not a delete.

    The reason we go to this trouble is to avoid writing the same file
    contents multiple times for a string of revisions that don't have
    deltatexts (as, for example, happens with dead revisions and
    imported revisions)."""

    if isinstance(cvs_rev, CVSRevisionDelete):
      # There is no need to record a delete revision, and its token
      # will never be needed:
      return None

    source = self._get_original_source(cvs_rev)
    if source.id != cvs_rev.id:
      # Another revision is the original source; return its
      # CVSRevision token as revision_recorder_token:
      return source.revision_recorder_token

    # Revision is its own source; write it out:
    return self._write_blob(fulltext)

  def finish_file(self, cvs_file_items):
    # Determine the original source of each CVSSymbol, and store it as
    # the symbol's revision_recorder_token.
    for cvs_item in cvs_file_items.values():
      if not isinstance(cvs_item, CVSSymbol):
        continue
      cvs_source = cvs_item.get_cvs_revision_source(cvs_file_items)
      cvs_item.revision_recorder_token = cvs_source.revision_recorder_token

    del self._cvs_file_items

  def finish(self):
    self.dump_file.close()
class SVNCommitCreator:
  """This class creates and yields SVNCommits via process_changeset()."""

  def __init__(self):
    # The revision number to assign to the next new SVNCommit.
    self.revnum_generator = KeyGenerator()

    # A set containing the Projects that have already been
    # initialized:
    self._initialized_projects = set()

  def _post_commit(self, cvs_revs, motivating_revnum, timestamp):
    """Generate any SVNCommits needed to follow CVS_REVS.

    That is, handle non-trunk default branches.  A revision on a CVS
    non-trunk default branch is visible in a default CVS checkout of
    HEAD.  So we copy such commits over to Subversion's trunk so that
    checking out SVN trunk gives the same output as checking out of
    CVS's default branch."""

    # Only non-noop revisions on a non-trunk default branch matter:
    cvs_revs = [
        cvs_rev
        for cvs_rev in cvs_revs
        if cvs_rev.ntdbr and not isinstance(cvs_rev, CVSRevisionNoop)
        ]

    if cvs_revs:
      # Sort by RCS path for deterministic output.  NOTE: cmp-function
      # sort() is Python 2-only.
      cvs_revs.sort(
          lambda a, b: cmp(a.cvs_file.rcs_path, b.cvs_file.rcs_path)
          )
      # Generate an SVNCommit for all of our default branch cvs_revs.
      yield SVNPostCommit(
          motivating_revnum, cvs_revs, timestamp,
          self.revnum_generator.gen_id(),
          )

  def _process_revision_changeset(self, changeset, timestamp):
    """Process CHANGESET, using TIMESTAMP as the commit time.

    Create and yield one or more SVNCommits in the process.  CHANGESET
    must be an OrderedChangeset.  TIMESTAMP is used as the timestamp
    for any resulting SVNCommits."""

    if not changeset.cvs_item_ids:
      logger.warn('Changeset has no items: %r' % changeset)
      return

    logger.verbose('-' * 60)
    logger.verbose('CVS Revision grouping:')
    logger.verbose('  Time: %s' % time.ctime(timestamp))

    # Generate an SVNCommit unconditionally.  Even if the only change in
    # this group of CVSRevisions is a deletion of an already-deleted
    # file (that is, a CVS revision in state 'dead' whose predecessor
    # was also in state 'dead'), the conversion will still generate a
    # Subversion revision containing the log message for the second dead
    # revision, because we don't want to lose that information.
    cvs_revs = list(changeset.iter_cvs_items())
    if cvs_revs:
      # Sort by RCS path for deterministic output (Python 2 cmp sort):
      cvs_revs.sort(lambda a, b: cmp(a.cvs_file.rcs_path,
                                     b.cvs_file.rcs_path))
      svn_commit = SVNPrimaryCommit(
          cvs_revs, timestamp, self.revnum_generator.gen_id()
          )

      yield svn_commit

      # Record each revision's SVN revision number for the symbol
      # pass:
      for cvs_rev in cvs_revs:
        Ctx()._symbolings_logger.log_revision(cvs_rev, svn_commit.revnum)

      # Generate an SVNPostCommit if we have default branch revs.  If
      # some of the revisions in this commit happened on a non-trunk
      # default branch, then those files have to be copied into trunk
      # manually after being changed on the branch (because the RCS
      # "default branch" appears as head, i.e., trunk, in practice).
      # Unfortunately, Subversion doesn't support copies with sources
      # in the current txn.  All copies must be based in committed
      # revisions.  Therefore, we generate the copies in a new
      # revision.
      for svn_post_commit in self._post_commit(
            cvs_revs, svn_commit.revnum, timestamp
            ):
        yield svn_post_commit

  def _process_tag_changeset(self, changeset, timestamp):
    """Process TagChangeset CHANGESET, producing a SVNTagCommit.

    Filter out CVSTagNoops.  If no CVSTags are left, don't generate a
    SVNTagCommit."""

    if Ctx().trunk_only:
      raise InternalError(
          'TagChangeset encountered during a --trunk-only conversion')

    cvs_tag_ids = [
        cvs_tag.id
        for cvs_tag in changeset.iter_cvs_items()
        if not isinstance(cvs_tag, CVSTagNoop)
        ]
    if cvs_tag_ids:
      yield SVNTagCommit(
          changeset.symbol, cvs_tag_ids, timestamp,
          self.revnum_generator.gen_id(),
          )
    else:
      logger.debug(
          'Omitting %r because it contains only CVSTagNoops' % (changeset,)
          )

  def _process_branch_changeset(self, changeset, timestamp):
    """Process BranchChangeset CHANGESET, producing a SVNBranchCommit.

    Filter out CVSBranchNoops.  If no CVSBranches are left, don't
    generate a SVNBranchCommit."""

    if Ctx().trunk_only:
      raise InternalError(
          'BranchChangeset encountered during a --trunk-only conversion')

    cvs_branches = [
        cvs_branch
        for cvs_branch in changeset.iter_cvs_items()
        if not isinstance(cvs_branch, CVSBranchNoop)
        ]
    if cvs_branches:
      svn_commit = SVNBranchCommit(
          changeset.symbol,
          [cvs_branch.id for cvs_branch in cvs_branches],
          timestamp,
          self.revnum_generator.gen_id(),
          )
      yield svn_commit
      # Record each branch's SVN revision number for the symbol pass:
      for cvs_branch in cvs_branches:
        Ctx()._symbolings_logger.log_branch_revision(
            cvs_branch, svn_commit.revnum
            )
    else:
      logger.debug(
          'Omitting %r because it contains only CVSBranchNoops'
          % (changeset,)
          )

  def process_changeset(self, changeset, timestamp):
    """Process CHANGESET, using TIMESTAMP for all of its entries.

    Return a generator that generates the resulting SVNCommits.

    The changesets must be fed to this function in proper dependency
    order."""

    # First create any new projects that might be opened by the
    # changeset:
    projects_opened = \
        changeset.get_projects_opened() - self._initialized_projects
    if projects_opened:
      if Ctx().cross_project_commits:
        # All new projects can share one initial commit:
        yield SVNInitialProjectCommit(
            timestamp, projects_opened, self.revnum_generator.gen_id()
            )
      else:
        # One initial commit per project:
        for project in projects_opened:
          yield SVNInitialProjectCommit(
              timestamp, [project], self.revnum_generator.gen_id()
              )
      self._initialized_projects.update(projects_opened)

    # Dispatch on the concrete changeset type:
    if isinstance(changeset, OrderedChangeset):
      for svn_commit \
            in self._process_revision_changeset(changeset, timestamp):
        yield svn_commit
    elif isinstance(changeset, TagChangeset):
      for svn_commit in self._process_tag_changeset(changeset, timestamp):
        yield svn_commit
    elif isinstance(changeset, BranchChangeset):
      for svn_commit in self._process_branch_changeset(changeset, timestamp):
        yield svn_commit
    else:
      raise TypeError('Illegal changeset %r' % changeset)
# NOTE(review): a second definition of GitOutputOption appears later in
# this file; at import time the later definition rebinds the name and
# shadows this one.  Confirm whether this duplication is intentional.
class GitOutputOption(OutputOption):
  """An OutputOption that outputs to a git-fast-import formatted file.

  Members:

    dump_filename -- (string) the name of the file to which the
        git-fast-import commands for defining revisions will be
        written.

    author_transforms -- a map {cvsauthor : (fullname, email)} from
        CVS author names to git full name and email address.  All of
        the contents are 8-bit strings encoded as UTF-8.

  """

  # The first mark number used for git-fast-import commit marks.  This
  # value needs to be large to avoid conflicts with blob marks.
  _first_commit_mark = 1000000000

  def __init__(
        self, dump_filename, revision_writer,
        max_merges=None, author_transforms=None,
        ):
    """Constructor.

    DUMP_FILENAME is the name of the file to which the git-fast-import
    commands for defining revisions should be written.  (Please note
    that depending on the style of revision writer, the actual file
    contents might not be written to this file.)

    REVISION_WRITER is a GitRevisionWriter that is used to output
    either the content of revisions or a mark that was previously used
    to label a blob.

    MAX_MERGES can be set to an integer telling the maximum number of
    parents that can be merged into a commit at once (aside from the
    natural parent).  If it is set to None, then there is no limit.

    AUTHOR_TRANSFORMS is a map {cvsauthor : (fullname, email)} from
    CVS author names to git full name and email address.  All of the
    contents should either be Unicode strings or 8-bit strings encoded
    as UTF-8.

    """

    self.dump_filename = dump_filename
    self.revision_writer = revision_writer
    self.max_merges = max_merges

    def to_utf8(s):
      # Normalize Unicode strings to 8-bit UTF-8; leave 8-bit strings
      # untouched:
      if isinstance(s, unicode):
        return s.encode('utf8')
      else:
        return s

    self.author_transforms = {}
    if author_transforms is not None:
      for (cvsauthor, (name, email,)) in author_transforms.iteritems():
        cvsauthor = to_utf8(cvsauthor)
        name = to_utf8(name)
        email = to_utf8(email)
        self.author_transforms[cvsauthor] = (name, email,)

    self._mirror = RepositoryMirror()

    # Commit marks start at _first_commit_mark to stay clear of the
    # (smaller) blob marks:
    self._mark_generator = KeyGenerator(GitOutputOption._first_commit_mark)

  def register_artifacts(self, which_pass):
    """Register the temporary files needed during WHICH_PASS."""

    # These artifacts are needed for SymbolingsReader:
    artifact_manager.register_temp_file_needed(
        config.SYMBOL_OPENINGS_CLOSINGS_SORTED, which_pass)
    artifact_manager.register_temp_file_needed(config.SYMBOL_OFFSETS_DB,
                                               which_pass)
    self.revision_writer.register_artifacts(which_pass)
    self._mirror.register_artifacts(which_pass)

  def check(self):
    """Check that the conversion options are compatible with git output.

    Raise FatalError for any unsupported option combination."""

    if Ctx().cross_project_commits:
      raise FatalError(
          'Git output is not supported with cross-project commits')
    if Ctx().cross_branch_commits:
      raise FatalError(
          'Git output is not supported with cross-branch commits')
    if Ctx().username is None:
      raise FatalError('Git output requires a default commit username')

  def check_symbols(self, symbol_map):
    # FIXME: What constraints does git impose on symbols?
    pass

  def setup(self, svn_rev_count):
    """Prepare for output: open the dump file and supporting state.

    SVN_REV_COUNT is accepted for interface compatibility but is not
    used here."""

    self._symbolings_reader = SymbolingsReader()
    self.f = open(self.dump_filename, 'wb')

    # The youngest revnum that has been committed so far:
    self._youngest = 0

    # A map {lod : [(revnum, mark)]} giving each of the revision
    # numbers in which there was a commit to lod, and the mark active
    # at the end of the revnum.
    self._marks = {}

    self._mirror.open()
    self.revision_writer.start(self.f, self._mirror)

  def _create_commit_mark(self, lod, revnum):
    """Allocate a new commit mark, record it for LOD at REVNUM, and
    return it."""

    mark = self._mark_generator.gen_id()
    self._set_lod_mark(lod, revnum, mark)
    return mark

  def _set_lod_mark(self, lod, revnum, mark):
    """Record MARK as the status of LOD for REVNUM.

    If there is already an entry for REVNUM, overwrite it.  If not,
    append a new entry to the self._marks list for LOD."""

    # Entries may only be added in monotonically nondecreasing revnum
    # order:
    assert revnum >= self._youngest
    entry = (revnum, mark)
    try:
      modifications = self._marks[lod]
    except KeyError:
      # This LOD hasn't appeared before; create a new list and add the
      # entry:
      self._marks[lod] = [entry]
    else:
      # A record exists, so it necessarily has at least one element:
      if modifications[-1][0] == revnum:
        modifications[-1] = entry
      else:
        modifications.append(entry)
    self._youngest = revnum

  def _get_author(self, svn_commit):
    """Return the author to be used for SVN_COMMIT.

    Return the author in the form needed by git; that is, 'foo <bar>'."""

    author = svn_commit.get_author()
    (name, email,) = self.author_transforms.get(author, (author, author,))
    return '%s <%s>' % (name, email,)

  @staticmethod
  def _get_log_msg(svn_commit):
    """Return the log message to be used for SVN_COMMIT."""

    return svn_commit.get_log_msg()

  def process_initial_project_commit(self, svn_commit):
    # Nothing is written to the dump file; only the mirror is advanced
    # past this revision:
    self._mirror.start_commit(svn_commit.revnum)
    self._mirror.end_commit()

  def process_primary_commit(self, svn_commit):
    """Emit a git-fast-import commit for the revisions in SVN_COMMIT.

    Raise InternalError if the commit's revisions do not all belong to
    a single line of development."""

    author = self._get_author(svn_commit)
    log_msg = self._get_log_msg(svn_commit)

    lods = set()
    for cvs_rev in svn_commit.get_cvs_items():
      lods.add(cvs_rev.lod)
    if len(lods) != 1:
      raise InternalError('Commit affects %d LODs' % (len(lods),))
    lod = lods.pop()

    self._mirror.start_commit(svn_commit.revnum)
    if isinstance(lod, Trunk):
      # FIXME: is this correct?:
      self.f.write('commit refs/heads/master\n')
    else:
      self.f.write('commit refs/heads/%s\n' % (lod.name,))
    self.f.write(
        'mark :%d\n'
        % (self._create_commit_mark(lod, svn_commit.revnum),)
        )
    self.f.write('committer %s %d +0000\n' % (author, svn_commit.date,))
    self.f.write('data %d\n' % (len(log_msg),))
    self.f.write('%s\n' % (log_msg,))
    for cvs_rev in svn_commit.get_cvs_items():
      self.revision_writer.process_revision(cvs_rev, post_commit=False)

    self.f.write('\n')
    self._mirror.end_commit()

  def process_post_commit(self, svn_commit):
    """Emit the commit that copies default-branch changes to trunk.

    Raise InternalError if the source revisions do not all belong to a
    single line of development."""

    author = self._get_author(svn_commit)
    log_msg = self._get_log_msg(svn_commit)

    source_lods = set()
    for cvs_rev in svn_commit.cvs_revs:
      source_lods.add(cvs_rev.lod)
    if len(source_lods) != 1:
      raise InternalError('Commit is from %d LODs' % (len(source_lods),))
    source_lod = source_lods.pop()

    self._mirror.start_commit(svn_commit.revnum)
    # FIXME: is this correct?:
    self.f.write('commit refs/heads/master\n')
    self.f.write(
        'mark :%d\n'
        % (self._create_commit_mark(None, svn_commit.revnum),)
        )
    self.f.write('committer %s %d +0000\n' % (author, svn_commit.date,))
    self.f.write('data %d\n' % (len(log_msg),))
    self.f.write('%s\n' % (log_msg,))
    self.f.write(
        'merge :%d\n'
        % (self._get_source_mark(source_lod, svn_commit.revnum),)
        )
    for cvs_rev in svn_commit.cvs_revs:
      self.revision_writer.process_revision(cvs_rev, post_commit=True)

    self.f.write('\n')
    self._mirror.end_commit()

  def _get_source_groups(self, svn_commit):
    """Return groups of sources for SVN_COMMIT.

    SVN_COMMIT is an instance of SVNSymbolCommit.  Yield tuples
    (source_lod, svn_revnum, cvs_symbols) where source_lod is the line
    of development and svn_revnum is the revision that should serve as
    a source, and cvs_symbols is a list of CVSSymbolItems that can be
    copied from that source.  The groups are returned in arbitrary
    order."""

    # Get a map {CVSSymbol : SVNRevisionRange}:
    range_map = self._symbolings_reader.get_range_map(svn_commit)

    # range_map, split up into one map per LOD; i.e., {LOD :
    # {CVSSymbol : SVNRevisionRange}}:
    lod_range_maps = {}

    for (cvs_symbol, range) in range_map.iteritems():
      lod_range_map = lod_range_maps.get(range.source_lod)
      if lod_range_map is None:
        lod_range_map = {}
        lod_range_maps[range.source_lod] = lod_range_map
      lod_range_map[cvs_symbol] = range

    # Sort the sources so that the branch that serves most often as
    # parent is processed first:
    lod_ranges = lod_range_maps.items()
    lod_ranges.sort(
        lambda (lod1, lod_range_map1), (lod2, lod_range_map2):
        -cmp(len(lod_range_map1), len(lod_range_map2)) or cmp(lod1, lod2))

    for (lod, lod_range_map) in lod_ranges:
      # Repeatedly pick the best source revision for this LOD and
      # drain the symbols it covers, until none remain:
      while lod_range_map:
        revision_scores = RevisionScores(lod_range_map.values())
        (source_lod, revnum, score) = revision_scores.get_best_revnum()
        assert source_lod == lod
        cvs_symbols = []
        for (cvs_symbol, range) in lod_range_map.items():
          if revnum in range:
            cvs_symbols.append(cvs_symbol)
            del lod_range_map[cvs_symbol]
        yield (lod, revnum, cvs_symbols)

  def _get_all_files(self, node):
    """Generate all of the CVSFiles under NODE."""

    for cvs_path in node:
      subnode = node[cvs_path]
      if subnode is None:
        # A None subnode marks a file entry:
        yield cvs_path
      else:
        for sub_cvs_path in self._get_all_files(subnode):
          yield sub_cvs_path

  def _is_simple_copy(self, svn_commit, source_groups):
    """Return True iff SVN_COMMIT can be created as a simple copy.

    SVN_COMMIT is an SVNTagCommit.  Return True iff it can be created
    as a simple copy from an existing revision (i.e., if the fixup
    branch can be avoided for this tag creation)."""

    # The first requirement is that there be exactly one source:
    if len(source_groups) != 1:
      return False

    (source_lod, svn_revnum, cvs_symbols) = source_groups[0]

    # The second requirement is that the destination LOD not already
    # exist:
    try:
      self._mirror.get_current_lod_directory(svn_commit.symbol)
    except KeyError:
      # The LOD doesn't already exist.  This is good.
      pass
    else:
      # The LOD already exists.  It cannot be created by a copy.
      return False

    # The third requirement is that the source LOD contains exactly
    # the same files as we need to add to the symbol:
    try:
      source_node = self._mirror.get_old_lod_directory(
          source_lod, svn_revnum)
    except KeyError:
      raise InternalError('Source %r does not exist' % (source_lod,))
    return (set([cvs_symbol.cvs_file for cvs_symbol in cvs_symbols
                 ]) == set(self._get_all_files(source_node)))

  def _get_source_mark(self, source_lod, revnum):
    """Return the mark active on SOURCE_LOD at the end of REVNUM."""

    modifications = self._marks[source_lod]
    # Find the last entry whose revnum is <= REVNUM:
    i = bisect.bisect_left(modifications, (revnum + 1,)) - 1
    (revnum, mark) = modifications[i]
    return mark

  def _process_symbol_commit(self, svn_commit, git_branch, source_groups,
                             mark):
    """Emit one fast-import commit on GIT_BRANCH for SVN_COMMIT.

    SOURCE_GROUPS are (source_lod, source_revnum, cvs_symbols) tuples
    as generated by _get_source_groups(); each contributes one merge
    parent.  MARK is the pre-allocated commit mark."""

    author = self._get_author(svn_commit)
    log_msg = self._get_log_msg(svn_commit)

    self.f.write('commit %s\n' % (git_branch,))
    self.f.write('mark :%d\n' % (mark,))
    self.f.write('committer %s %d +0000\n' % (author, svn_commit.date,))
    self.f.write('data %d\n' % (len(log_msg),))
    self.f.write('%s\n' % (log_msg,))

    # All 'merge' commands must precede the file modifications:
    for (source_lod, source_revnum, cvs_symbols,) in source_groups:
      self.f.write(
          'merge :%d\n'
          % (self._get_source_mark(source_lod, source_revnum),)
          )

    for (source_lod, source_revnum, cvs_symbols,) in source_groups:
      for cvs_symbol in cvs_symbols:
        self.revision_writer.branch_file(cvs_symbol)

    self.f.write('\n')

  def process_branch_commit(self, svn_commit):
    self._mirror.start_commit(svn_commit.revnum)
    source_groups = list(self._get_source_groups(svn_commit))
    # Respect max_merges by splitting the sources into chunks, one
    # commit per chunk:
    for groups in get_chunks(source_groups, self.max_merges):
      self._process_symbol_commit(
          svn_commit, 'refs/heads/%s' % (svn_commit.symbol.name,),
          groups,
          self._create_commit_mark(svn_commit.symbol, svn_commit.revnum),
          )
    self._mirror.end_commit()

  def _set_symbol(self, symbol, mark):
    """Emit a 'reset' pointing SYMBOL's git ref at commit MARK."""

    if isinstance(symbol, Branch):
      category = 'heads'
    elif isinstance(symbol, Tag):
      category = 'tags'
    else:
      raise InternalError()
    self.f.write('reset refs/%s/%s\n' % (category, symbol.name,))
    self.f.write('from :%d\n' % (mark,))

  def process_tag_commit(self, svn_commit):
    # FIXME: For now we create a fixup branch with the same name as
    # the tag, then the tag.  We never delete the fixup branch.  Also,
    # a fixup branch is created even if the tag could be created from
    # a single source.
    self._mirror.start_commit(svn_commit.revnum)
    source_groups = list(self._get_source_groups(svn_commit))
    if self._is_simple_copy(svn_commit, source_groups):
      (source_lod, source_revnum, cvs_symbols) = source_groups[0]
      Log().debug(
          '%s will be created via a simple copy from %s:r%d'
          % (svn_commit.symbol, source_lod, source_revnum,)
          )
      mark = self._get_source_mark(source_lod, source_revnum)
      self._set_symbol(svn_commit.symbol, mark)
    else:
      Log().debug(
          '%s will be created via a fixup branch' % (svn_commit.symbol,)
          )

      # Create the fixup branch (which might involve making more than
      # one commit):
      for groups in get_chunks(source_groups, self.max_merges):
        mark = self._create_commit_mark(svn_commit.symbol, svn_commit.revnum)
        self._process_symbol_commit(
            svn_commit, FIXUP_BRANCH_NAME, groups, mark)

      # Store the mark of the last commit to the fixup branch as the
      # value of the tag:
      self._set_symbol(svn_commit.symbol, mark)
      self.f.write('reset %s\n' % (FIXUP_BRANCH_NAME,))
      self.f.write('\n')

    self._mirror.end_commit()

  def cleanup(self):
    """Close the writer, the mirror, and the dump file."""

    self.revision_writer.finish()
    self._mirror.close()
    self.f.close()
    del self.f
    self._symbolings_reader.close()
    del self._symbolings_reader
class GitRevisionRecorder(FulltextRevisionRecorder):
  """Output file revisions to git-fast-import."""

  def __init__(self, blob_filename):
    self.blob_filename = blob_filename

  def start(self):
    self.dump_file = open(self.blob_filename, 'wb')
    self._mark_generator = KeyGenerator()

  def start_file(self, cvs_file_items):
    self._cvs_file_items = cvs_file_items

  def _get_original_source(self, cvs_rev):
    """Return the first CVSRevision whose content equals CVS_REV's.

    'First' is meant in deltatext order: the very first revision is
    HEAD on trunk, then backwards to the root of a branch, then out to
    the tip of a branch.  If no other CVSRevision has the same
    content, CVS_REV itself is returned."""

    rev = cvs_rev
    while not rev.deltatext_exists:
      if not isinstance(rev.lod, Trunk):
        # On a branch, deltas run away from trunk; step back toward
        # the branch root:
        rev = self._cvs_file_items[rev.prev_id]
      elif rev.next_id is not None:
        # On trunk, deltas run backwards from HEAD; step toward HEAD:
        rev = self._cvs_file_items[rev.next_id]
      else:
        # The HEAD revision on trunk is always its own source, even
        # if its deltatext (i.e., its fulltext) is empty:
        break
    return rev

  def record_fulltext(self, cvs_rev, log, fulltext):
    """Write FULLTEXT to a blob if CVS_REV is an original revision.

    Following the CVS delta-dependency chain backwards to the first
    revision with a deltatext avoids writing identical contents more
    than once for runs of revisions without deltatexts (e.g. dead or
    imported revisions)."""

    source = self._get_original_source(cvs_rev)
    if source.id != cvs_rev.id:
      # Not original; reuse the token recorded for the source revision:
      return source.revision_recorder_token

    # Revision is its own source; emit a blob and return its mark:
    mark = self._mark_generator.gen_id()
    out = self.dump_file
    out.write('blob\n')
    out.write('mark :%d\n' % (mark,))
    out.write('data %d\n' % (len(fulltext),))
    out.write(fulltext)
    out.write('\n')
    return mark

  def finish_file(self, cvs_file_items):
    # Give each CVSSymbol the token of the CVSRevision it ultimately
    # derives from:
    for item in cvs_file_items.values():
      if not isinstance(item, CVSSymbol):
        continue
      source = cvs_file_items[item.source_id]
      while not isinstance(source, CVSRevision):
        source = cvs_file_items[source.source_id]
      item.revision_recorder_token = source.revision_recorder_token
    del self._cvs_file_items

  def finish(self):
    self.dump_file.close()
class GitOutputOption(OutputOption): """An OutputOption that outputs to a git-fast-import formatted file. Members: dump_filename -- (string) the name of the file to which the git-fast-import commands for defining revisions will be written. author_transforms -- a map {cvsauthor : (fullname, email)} from CVS author names to git full name and email address. All of the contents are 8-bit strings encoded as UTF-8. """ # The first mark number used for git-fast-import commit marks. This # value needs to be large to avoid conflicts with blob marks. _first_commit_mark = 1000000000 def __init__( self, dump_filename, revision_writer, max_merges=None, author_transforms=None, ): """Constructor. DUMP_FILENAME is the name of the file to which the git-fast-import commands for defining revisions should be written. (Please note that depending on the style of revision writer, the actual file contents might not be written to this file.) REVISION_WRITER is a GitRevisionWriter that is used to output either the content of revisions or a mark that was previously used to label a blob. MAX_MERGES can be set to an integer telling the maximum number of parents that can be merged into a commit at once (aside from the natural parent). If it is set to None, then there is no limit. AUTHOR_TRANSFORMS is a map {cvsauthor : (fullname, email)} from CVS author names to git full name and email address. All of the contents should either be Unicode strings or 8-bit strings encoded as UTF-8. 
""" self.dump_filename = dump_filename self.revision_writer = revision_writer self.max_merges = max_merges def to_utf8(s): if isinstance(s, unicode): return s.encode('utf8') else: return s self.author_transforms = {} if author_transforms is not None: for (cvsauthor, (name, email,)) in author_transforms.iteritems(): cvsauthor = to_utf8(cvsauthor) name = to_utf8(name) email = to_utf8(email) self.author_transforms[cvsauthor] = (name, email,) self._mirror = RepositoryMirror() self._mark_generator = KeyGenerator(GitOutputOption._first_commit_mark) def register_artifacts(self, which_pass): # These artifacts are needed for SymbolingsReader: artifact_manager.register_temp_file_needed( config.SYMBOL_OPENINGS_CLOSINGS_SORTED, which_pass ) artifact_manager.register_temp_file_needed( config.SYMBOL_OFFSETS_DB, which_pass ) self.revision_writer.register_artifacts(which_pass) self._mirror.register_artifacts(which_pass) def check(self): if Ctx().cross_project_commits: raise FatalError( 'Git output is not supported with cross-project commits' ) if Ctx().cross_branch_commits: raise FatalError( 'Git output is not supported with cross-branch commits' ) if Ctx().username is None: raise FatalError( 'Git output requires a default commit username' ) def check_symbols(self, symbol_map): # FIXME: What constraints does git impose on symbols? pass def setup(self, svn_rev_count): self._symbolings_reader = SymbolingsReader() self.f = open(self.dump_filename, 'wb') # The youngest revnum that has been committed so far: self._youngest = 0 # A map {lod : [(revnum, mark)]} giving each of the revision # numbers in which there was a commit to lod, and the mark active # at the end of the revnum. self._marks = {} self._mirror.open() self.revision_writer.start(self.f, self._mirror) def _create_commit_mark(self, lod, revnum): mark = self._mark_generator.gen_id() self._set_lod_mark(lod, revnum, mark) return mark def _set_lod_mark(self, lod, revnum, mark): """Record MARK as the status of LOD for REVNUM. 
If there is already an entry for REVNUM, overwrite it. If not, append a new entry to the self._marks list for LOD.""" assert revnum >= self._youngest entry = (revnum, mark) try: modifications = self._marks[lod] except KeyError: # This LOD hasn't appeared before; create a new list and add the # entry: self._marks[lod] = [entry] else: # A record exists, so it necessarily has at least one element: if modifications[-1][0] == revnum: modifications[-1] = entry else: modifications.append(entry) self._youngest = revnum def _get_author(self, svn_commit): """Return the author to be used for SVN_COMMIT. Return the author in the form needed by git; that is, 'foo <bar>'.""" author = svn_commit.get_author() (name, email,) = self.author_transforms.get(author, (author, author,)) return '%s <%s>' % (name, email,) @staticmethod def _get_log_msg(svn_commit): return svn_commit.get_log_msg() def process_initial_project_commit(self, svn_commit): self._mirror.start_commit(svn_commit.revnum) self._mirror.end_commit() def process_primary_commit(self, svn_commit): author = self._get_author(svn_commit) log_msg = self._get_log_msg(svn_commit) lods = set() for cvs_rev in svn_commit.get_cvs_items(): lods.add(cvs_rev.lod) if len(lods) != 1: raise InternalError('Commit affects %d LODs' % (len(lods),)) lod = lods.pop() self._mirror.start_commit(svn_commit.revnum) if isinstance(lod, Trunk): # FIXME: is this correct?: self.f.write('commit refs/heads/master\n') else: self.f.write('commit refs/heads/%s\n' % (lod.name,)) self.f.write( 'mark :%d\n' % (self._create_commit_mark(lod, svn_commit.revnum),) ) self.f.write( 'committer %s %d +0000\n' % (author, svn_commit.date,) ) self.f.write('data %d\n' % (len(log_msg),)) self.f.write('%s\n' % (log_msg,)) for cvs_rev in svn_commit.get_cvs_items(): self.revision_writer.process_revision(cvs_rev, post_commit=False) self.f.write('\n') self._mirror.end_commit() def process_post_commit(self, svn_commit): author = self._get_author(svn_commit) log_msg = 
self._get_log_msg(svn_commit) source_lods = set() for cvs_rev in svn_commit.cvs_revs: source_lods.add(cvs_rev.lod) if len(source_lods) != 1: raise InternalError('Commit is from %d LODs' % (len(source_lods),)) source_lod = source_lods.pop() self._mirror.start_commit(svn_commit.revnum) # FIXME: is this correct?: self.f.write('commit refs/heads/master\n') self.f.write( 'mark :%d\n' % (self._create_commit_mark(None, svn_commit.revnum),) ) self.f.write( 'committer %s %d +0000\n' % (author, svn_commit.date,) ) self.f.write('data %d\n' % (len(log_msg),)) self.f.write('%s\n' % (log_msg,)) self.f.write( 'merge :%d\n' % (self._get_source_mark(source_lod, svn_commit.revnum),) ) for cvs_rev in svn_commit.cvs_revs: self.revision_writer.process_revision(cvs_rev, post_commit=True) self.f.write('\n') self._mirror.end_commit() def _get_source_groups(self, svn_commit): """Return groups of sources for SVN_COMMIT. SVN_COMMIT is an instance of SVNSymbolCommit. Yield tuples (source_lod, svn_revnum, cvs_symbols) where source_lod is the line of development and svn_revnum is the revision that should serve as a source, and cvs_symbols is a list of CVSSymbolItems that can be copied from that source. 
The groups are returned in arbitrary order.""" # Get a map {CVSSymbol : SVNRevisionRange}: range_map = self._symbolings_reader.get_range_map(svn_commit) # range_map, split up into one map per LOD; i.e., {LOD : # {CVSSymbol : SVNRevisionRange}}: lod_range_maps = {} for (cvs_symbol, range) in range_map.iteritems(): lod_range_map = lod_range_maps.get(range.source_lod) if lod_range_map is None: lod_range_map = {} lod_range_maps[range.source_lod] = lod_range_map lod_range_map[cvs_symbol] = range # Sort the sources so that the branch that serves most often as # parent is processed first: lod_ranges = lod_range_maps.items() lod_ranges.sort( lambda (lod1,lod_range_map1),(lod2,lod_range_map2): -cmp(len(lod_range_map1), len(lod_range_map2)) or cmp(lod1, lod2) ) for (lod, lod_range_map) in lod_ranges: while lod_range_map: revision_scores = RevisionScores(lod_range_map.values()) (source_lod, revnum, score) = revision_scores.get_best_revnum() assert source_lod == lod cvs_symbols = [] for (cvs_symbol, range) in lod_range_map.items(): if revnum in range: cvs_symbols.append(cvs_symbol) del lod_range_map[cvs_symbol] yield (lod, revnum, cvs_symbols) def _get_all_files(self, node): """Generate all of the CVSFiles under NODE.""" for cvs_path in node: subnode = node[cvs_path] if subnode is None: yield cvs_path else: for sub_cvs_path in self._get_all_files(subnode): yield sub_cvs_path def _is_simple_copy(self, svn_commit, source_groups): """Return True iff SVN_COMMIT can be created as a simple copy. SVN_COMMIT is an SVNTagCommit. 
Return True iff it can be created as a simple copy from an existing revision (i.e., if the fixup branch can be avoided for this tag creation).""" # The first requirement is that there be exactly one source: if len(source_groups) != 1: return False (source_lod, svn_revnum, cvs_symbols) = source_groups[0] # The second requirement is that the destination LOD not already # exist: try: self._mirror.get_current_lod_directory(svn_commit.symbol) except KeyError: # The LOD doesn't already exist. This is good. pass else: # The LOD already exists. It cannot be created by a copy. return False # The third requirement is that the source LOD contains exactly # the same files as we need to add to the symbol: try: source_node = self._mirror.get_old_lod_directory(source_lod, svn_revnum) except KeyError: raise InternalError('Source %r does not exist' % (source_lod,)) return ( set([cvs_symbol.cvs_file for cvs_symbol in cvs_symbols]) == set(self._get_all_files(source_node)) ) def _get_source_mark(self, source_lod, revnum): """Return the mark active on SOURCE_LOD at the end of REVNUM.""" modifications = self._marks[source_lod] i = bisect.bisect_left(modifications, (revnum + 1,)) - 1 (revnum, mark) = modifications[i] return mark def _process_symbol_commit( self, svn_commit, git_branch, source_groups, mark ): author = self._get_author(svn_commit) log_msg = self._get_log_msg(svn_commit) self.f.write('commit %s\n' % (git_branch,)) self.f.write('mark :%d\n' % (mark,)) self.f.write('committer %s %d +0000\n' % (author, svn_commit.date,)) self.f.write('data %d\n' % (len(log_msg),)) self.f.write('%s\n' % (log_msg,)) for (source_lod, source_revnum, cvs_symbols,) in source_groups: self.f.write( 'merge :%d\n' % (self._get_source_mark(source_lod, source_revnum),) ) for (source_lod, source_revnum, cvs_symbols,) in source_groups: for cvs_symbol in cvs_symbols: self.revision_writer.branch_file(cvs_symbol) self.f.write('\n') def process_branch_commit(self, svn_commit): 
self._mirror.start_commit(svn_commit.revnum) source_groups = list(self._get_source_groups(svn_commit)) for groups in get_chunks(source_groups, self.max_merges): self._process_symbol_commit( svn_commit, 'refs/heads/%s' % (svn_commit.symbol.name,), groups, self._create_commit_mark(svn_commit.symbol, svn_commit.revnum), ) self._mirror.end_commit() def _set_symbol(self, symbol, mark): if isinstance(symbol, Branch): category = 'heads' elif isinstance(symbol, Tag): category = 'tags' else: raise InternalError() self.f.write('reset refs/%s/%s\n' % (category, symbol.name,)) self.f.write('from :%d\n' % (mark,)) def process_tag_commit(self, svn_commit): # FIXME: For now we create a fixup branch with the same name as # the tag, then the tag. We never delete the fixup branch. Also, # a fixup branch is created even if the tag could be created from # a single source. self._mirror.start_commit(svn_commit.revnum) source_groups = list(self._get_source_groups(svn_commit)) if self._is_simple_copy(svn_commit, source_groups): (source_lod, source_revnum, cvs_symbols) = source_groups[0] Log().debug( '%s will be created via a simple copy from %s:r%d' % (svn_commit.symbol, source_lod, source_revnum,) ) mark = self._get_source_mark(source_lod, source_revnum) self._set_symbol(svn_commit.symbol, mark) else: Log().debug( '%s will be created via a fixup branch' % (svn_commit.symbol,) ) # Create the fixup branch (which might involve making more than # one commit): for groups in get_chunks(source_groups, self.max_merges): mark = self._create_commit_mark(svn_commit.symbol, svn_commit.revnum) self._process_symbol_commit( svn_commit, FIXUP_BRANCH_NAME, groups, mark ) # Store the mark of the last commit to the fixup branch as the # value of the tag: self._set_symbol(svn_commit.symbol, mark) self.f.write('reset %s\n' % (FIXUP_BRANCH_NAME,)) self.f.write('\n') self._mirror.end_commit() def cleanup(self): self.revision_writer.finish() self._mirror.close() self.f.close() del self.f 
self._symbolings_reader.close() del self._symbolings_reader