def get_self_paced_courses(cfg: Dict[str, str]) -> Sequence[str]:
    """
    Locate the self-paced courses in the local clone of the course Git repo.

    Every entry of the colon-separated SELF_PACED_PATH is resolved relative
    to COURSE_REPO. Inside each resulting directory, a subdirectory whose
    name does not start with "." is reported as a course if it directly
    contains a non-directory entry whose name starts with "build" and whose
    extension is ".yaml". Path entries that are not existing directories are
    logged via debug() and skipped.

    This function is a generator: names are produced lazily, in the order
    os.listdir() returns them, and each matching course directory is
    reported once per path entry no matter how many build files it holds.

    :param cfg: the loaded config. COURSE_REPO and SELF_PACED_PATH must be set

    :return: the names of all self-paced courses (as simple directory names)
    """

    def contains_build_file(course_dir: str) -> bool:
        # True as soon as one plain "build*.yaml" entry is found.
        for entry in os.listdir(course_dir):
            if os.path.isdir(os.path.join(course_dir, entry)):
                continue
            extension = os.path.splitext(entry)[1]
            if entry.startswith("build") and extension == '.yaml':
                return True
        return False

    search_path = cfg['SELF_PACED_PATH']
    for relative_dir in search_path.split(':'):
        courses_root = os.path.join(cfg['COURSE_REPO'], relative_dir)
        if not os.path.isdir(courses_root):
            debug(f'Directory "{courses_root}" (in SELF_PACED_PATH) '
                  'does not exist.')
            continue

        for name in os.listdir(courses_root):
            # Hidden entries (e.g. ".git") are never courses.
            if name.startswith('.'):
                continue

            candidate = os.path.join(courses_root, name)
            if os.path.isdir(candidate) and contains_build_file(candidate):
                yield name
def _parse_source_notebook(path: str, encoding: str) -> Notebook:
    """
    Parse a Databricks source notebook into a Notebook object.

    The first line of the file must match the notebook header pattern for
    the notebook's language. The remaining lines are split into cells at
    "COMMAND ---" marker lines; the line immediately following a marker is
    always discarded. The first retained line of a cell decides its type:

    - a non-MAGIC line makes it a code cell in the notebook's base language;
    - a MAGIC line whose first token starts with "%" makes it a cell of the
      type named by that token;
    - an empty MAGIC line is skipped, and the next line is then treated as
      the first line of the cell.

    The leading comment/MAGIC prefix is stripped from every MAGIC line
    before it is stored. Cells are numbered from 1 in file order and each
    receives a fresh random GUID.

    :param path: the path to the notebook
    :param encoding: the encoding to use when reading the file

    :returns: a parsed Notebook object

    :raises NotebookParseError: if the notebook cannot be parsed (empty
        file, missing header, or a cell whose first MAGIC token does not
        start with "%")
    :raises NotebookError: other errors (e.g., invalid file type)
    """
    language = NotebookLanguage.from_path(path)
    comment_string = COMMENT_STRINGS[language]
    leading_comment = _leading_comment_pattern(comment_string)
    header = _notebook_header_re(comment_string)
    # Group 1 is the first whitespace-free token after MAGIC (e.g. "%md");
    # group 2 is whatever follows it on the line.
    magic = re.compile(r"{}\s+MAGIC\s?([^\s]*)(.*)$".format(leading_comment))
    new_cell = re.compile(r"{}\s+COMMAND\s+-+.*$".format(leading_comment))

    def check_for_header(line):
        if not header.search(line):
            raise NotebookParseError(
                f'File "{path}" is missing expected Databricks header')

    cells = []
    cur_cell = EmptyCell
    command_buf = []

    lines = _read_notebook(path, encoding)
    if len(lines) == 0:
        raise NotebookParseError(f'File "{path}" is empty.')

    check_for_header(lines[0])
    # The header is treated like a cell marker: the next retained line is
    # the first line of the first cell.
    saw_new_cell = True
    skip_next = False

    for i, line in enumerate(lines[1:]):
        line_num = i + 2  # account for skipped header
        if skip_next:
            debug(f'"{path}", line {line_num}: Skipping...')
            skip_next = False
            continue

        # If this line matches the start of a new cell marker, save the
        # existing cell and reset all the variables.
        if new_cell.search(line):
            debug(f'"{path}", line {line_num}: New command')
            if cur_cell != EmptyCell:
                # The last line of any cell should be blank and should
                # be removed, as it is really just a separator before the
                # marker starting a new cell. (command_buf is never empty
                # here: cur_cell is only set together with an append.)
                if len(command_buf[-1].strip()) == 0:
                    command_buf = command_buf[:-1]
                cells.append(
                    dataclasses.replace(cur_cell,
                                        command="\n".join(command_buf)))
                cur_cell = EmptyCell

            saw_new_cell = True
            skip_next = True
            command_buf = []
            continue

        # Is this cell a "MAGIC" cell? If so, extract the contents without
        # the leading MAGIC indicator.
        m = magic.search(line)
        if m:
            line = f"{m.group(1)}{m.group(2)}"

        # If we didn't see the new cell marker, then keep accumulating the
        # current cell and move on to the next line.
        if not saw_new_cell:
            debug(f'"{path}", line {line_num}: Not first line')
            command_buf.append(line)
            continue

        # Start of a new cell requires additional processing.
        saw_new_cell = False

        if not m:
            # Not a magic line. It is, therefore, a code cell of the same
            # type as the base language of the notebook.
            debug(f"{path}, line {line_num}: No magic")
            command_buf.append(line)
            cur_cell = dataclasses.replace(
                cur_cell,
                cell_type=CellType.from_language(language),
                marked_magic=False)
            continue

        # Magic line as first line in cell. If it's an empty magic line,
        # skip it.
        token = m.group(1).strip()
        if not token:
            debug(f'"{path}", line {line_num}: Skipping empty magic.')
            # Still waiting for the cell's real first line. Without
            # re-arming this flag the rest of the cell would be buffered
            # while cur_cell stayed EmptyCell, and the whole cell would be
            # silently dropped.
            saw_new_cell = True
            continue

        # Extract cell type, if it exists.
        debug(f'"{path}", line {line_num}: Magic')
        if token[0] != "%":
            raise NotebookParseError(
                f'"{path}", line {line_num}: Bad first magic cell line: {line}'
            )

        command_buf.append(line)
        cur_cell = dataclasses.replace(
            cur_cell,
            cell_type=CellType.from_string(token),
        )

    # If there's an unfinished cell left, finish it.
    if cur_cell != EmptyCell:
        cells.append(
            dataclasses.replace(cur_cell, command="\n".join(command_buf)))

    # Assign 1-based positions and a fresh GUID to every cell.
    cells = [
        dataclasses.replace(cell, position=i + 1, guid=uuid.uuid4())
        for i, cell in enumerate(cells)
    ]
    return Notebook(cells=cells, path=path)