def handle(self, *args, **options):
    """Ingest workshop datafiles into the database.

    For every ``(slug, path)`` pair returned by ``get_all_existing_workshops``
    (narrowed by the ``name`` option when given), the YAML datafile
    ``{path}/{slug}.yml`` is read and its workshop, frontmatter, praxis and
    lesson records are created or updated.

    Options read: ``verbose``, ``silent``, ``name``, ``no_reminder``.
    """
    log = Logger(path=__file__,
                 force_verbose=options.get('verbose'),
                 force_silent=options.get('silent'))
    # Not used further in this command; constructed as in the original code.
    input = Input(path=__file__)

    if options.get('name'):
        workshops = get_all_existing_workshops(options.get('name'))
    else:
        workshops = get_all_existing_workshops()

    # --- small helpers (independent of the workshop being processed) -------

    def _get_valid_name(filename):
        # TODO: should exist a built-in for django here?
        return filename.replace('@', '')

    def _get_media_path(valid_filename):
        return settings.MEDIA_ROOT + '/' + Workshop.image.field.upload_to + valid_filename

    def _get_media_url(valid_filename):
        return Workshop.image.field.upload_to + valid_filename

    def _image_exists(valid_filename):
        return os.path.exists(_get_media_path(valid_filename))

    def _get_default_image():
        return Workshop.image.field.default

    def _attach_resource(cat, category, add_field, point):
        """Create/update one Resource described by `point` and link it through `add_field`."""
        if not add_field or not category:
            log.error(
                f'Cannot interpret category `{cat}`. Make sure the script is correct and corresponds with the database structure.'
            )
        lookup = {
            'category': category,
            'title': point.get('linked_text'),
            'url': point.get('url'),
        }
        try:
            obj, _ = Resource.objects.update_or_create(
                annotation=point.get('annotation'), **lookup)
        except IntegrityError:
            # A resource with the same category/title/url already exists with
            # another annotation: update it in place and persist the change.
            obj = Resource.objects.get(**lookup)
            old_annotation = obj.annotation
            obj.annotation = point.get('annotation')
            obj.save()
            log.info(
                f'Another resource with the same URL, title, and category already existed so updated with a new annotation: **{obj.title} (old)**\n{old_annotation}\n-------\n**{obj.title} (new)**\n{obj.annotation}'
            )
        if obj not in add_field.all():
            add_field.add(obj)

    for slug, path in workshops:
        DATAFILE = f'{path}/{slug}.yml'
        d = get_yaml(DATAFILE, log=log)

        # Separate out data
        imagedata = d.get('image')
        frontmatterdata = d.get('sections').get('frontmatter')
        praxisdata = d.get('sections').get('theory-to-practice')
        lessondata = d.get('sections').get('lessons')

        full_name = d.get('name')

        # 1. ENTER WORKSHOP
        workshop_defaults = {
            'parent_backend': d.get('parent_backend'),
            'parent_branch': d.get('parent_branch'),
            'parent_repo': d.get('parent_repo'),
        }
        if imagedata:
            # Only set the alt text when image data exists, so that the
            # "no image" branch below can actually be reached.
            workshop_defaults['image_alt'] = imagedata['alt']

        workshop, _ = Workshop.objects.update_or_create(
            name=full_name,
            slug=dhri_slugify(full_name),
            defaults=workshop_defaults)

        if imagedata:
            source_file = imagedata['url']
            valid_filename = _get_valid_name(
                slug + '-' + os.path.basename(imagedata['url']))
            # Copy the file when it is missing from the media folder or differs
            # from the source file.
            if not _image_exists(valid_filename) or not filecmp.cmp(
                    source_file, _get_media_path(valid_filename),
                    shallow=False):
                try:
                    with open(source_file, 'rb') as f:
                        workshop.image = File(f, name=valid_filename)
                        workshop.save()
                except FileNotFoundError:
                    log.error(
                        f'File `{source_file}` could not be found. Did you run `python manage.py buildworkshop` before you ran this command?'
                    )
            workshop.image.name = _get_media_url(valid_filename)
            workshop.save()
        else:
            log.warning(
                f'Workshop {workshop.name} does not have an image assigned to it. Add filepaths to an existing file in your datafile ({DATAFILE}) if you want to update the specific workshop. Default workshop image (`{os.path.basename(_get_default_image())}`) will be assigned.'
            )
            workshop.image.name = Workshop.image.field.default
            workshop.save()
            if not _image_exists(
                    _get_valid_name(os.path.basename(_get_default_image()))):
                log.warning(
                    f'Default workshop image does not exist. You will want to add it manually to the correct folder: {_get_media_path("")}'
                )

        # Saving the slug in a format that matches the GitHub repositories (special method `save_slug`)
        workshop.slug = slug
        workshop.save_slug()

        # 2. ENTER FRONTMATTER
        frontmatter, _ = Frontmatter.objects.update_or_create(
            workshop=workshop,
            defaults={
                'abstract': frontmatterdata.get('abstract'),
                'estimated_time': frontmatterdata.get('estimated_time')
            })

        for point in frontmatterdata.get('ethical_considerations') or []:
            EthicalConsideration.objects.update_or_create(
                frontmatter=frontmatter, label=point.get('annotation'))

        for point in frontmatterdata.get('learning_objectives') or []:
            LearningObjective.objects.update_or_create(
                frontmatter=frontmatter, label=point.get('annotation'))

        frontmatter_resources = {
            'projects': (Resource.PROJECT, frontmatter.projects),
            'readings': (Resource.READING, frontmatter.readings),
            'cheat_sheets': (Resource.CHEATSHEET, frontmatter.cheat_sheets),
            'datasets': (Resource.DATASET, frontmatter.datasets),
        }
        for cat, (category, add_field) in frontmatter_resources.items():
            for point in frontmatterdata.get(cat) or []:
                _attach_resource(cat, category, add_field, point)

        for point in frontmatterdata.get('contributors') or []:
            profile = None
            try:
                profile = Profile.objects.get(
                    user__first_name=point.get('first_name'),
                    user__last_name=point.get('last_name'))
            except (Profile.DoesNotExist, Profile.MultipleObjectsReturned):
                # Fall back to comparing the full name of every profile.
                for p in Profile.objects.all():
                    if f'{p.user.first_name} {p.user.last_name}' == point.get('full_name'):
                        profile = p
                        log.info(
                            f'In-depth search revealed a profile matching the full name for `{workshop.name}` contributor `{point.get("first_name")} {point.get("last_name")}`. It may or may not be the correct person, so make sure you verify it manually.'
                        )
                if not profile:
                    log.info(
                        f'Could not find user profile on the curriculum website for contributor `{point.get("full_name")}` (searching by first name `{point.get("first_name")}` and last name `{point.get("last_name")}`).'
                    )

            contributor, _ = Contributor.objects.update_or_create(
                first_name=point.get('first_name'),
                last_name=point.get('last_name'),
                defaults={
                    'url': point.get('link'),
                    'profile': profile
                })

            Collaboration.objects.update_or_create(
                frontmatter=frontmatter,
                contributor=contributor,
                defaults={
                    'current': point.get('current'),
                    'role': point.get('role')
                })

        # 3. ENTER PRAXIS
        praxis, _ = Praxis.objects.update_or_create(
            workshop=workshop,
            defaults={
                'intro': praxisdata.get('intro'),
            })

        ordered_models = {
            'discussion_questions': DiscussionQuestion,
            'next_steps': NextStep,
        }
        for cat, model in ordered_models.items():
            # TODO: Should we pull out order manually here? Not necessary, right?
            for order, point in enumerate(praxisdata.get(cat) or [], start=1):
                model.objects.update_or_create(
                    praxis=praxis,
                    label=point.get('annotation'),
                    defaults={'order': order})

        praxis_resources = {
            'further_readings': (Resource.READING, praxis.further_readings),
            'further_projects': (Resource.PROJECT, praxis.further_projects),
            'tutorials': (Resource.TUTORIAL, praxis.tutorials),
        }
        for cat, (category, add_field) in praxis_resources.items():
            for point in praxisdata.get(cat) or []:
                _attach_resource(cat, category, add_field, point)

        # 4. ENTER LESSONS
        for lessoninfo in lessondata:
            lesson, _ = Lesson.objects.update_or_create(
                workshop=workshop,
                title=lessoninfo.get('header'),
                defaults={
                    'order': lessoninfo.get('order'),
                    'text': lessoninfo.get('content'),
                })

            for image in lessoninfo.get('lesson_images') or []:
                LessonImage.objects.update_or_create(url=image.get('path'),
                                                     lesson=lesson,
                                                     alt=image.get('alt'))

            if not lessoninfo.get('challenge') and lessoninfo.get('solution'):
                log.error(
                    f'Lesson `{lesson.title}` (in workshop {workshop}) has a solution but no challenge. Correct the files on GitHub and rerun the buildworkshop command and then re-attempt the ingestworkshop command. Alternatively, you can change the datafile content manually.'
                )

            if lessoninfo.get('challenge'):
                challenge, _ = Challenge.objects.update_or_create(
                    lesson=lesson,
                    title=lessoninfo['challenge'].get('header'),
                    defaults={'text': lessoninfo['challenge'].get('content')})

                # A solution is only stored together with the challenge of the
                # same lesson, never with one left over from an earlier lesson.
                if lessoninfo.get('solution'):
                    Solution.objects.update_or_create(
                        challenge=challenge,
                        title=lessoninfo['solution'].get('header'),
                        defaults={
                            'text': lessoninfo['solution'].get('content')
                        })

            if lessoninfo.get('evaluation'):
                evaluation, _ = Evaluation.objects.get_or_create(lesson=lesson)
                for point in lessoninfo['evaluation'].get('content'):
                    question, _ = Question.objects.update_or_create(
                        evaluation=evaluation, label=point.get('question'))
                    for kind, answers in point.get('answers').items():
                        is_correct = kind == 'correct'
                        for answertext in answers:
                            Answer.objects.update_or_create(
                                question=question,
                                label=answertext,
                                defaults={'is_correct': is_correct})

            if lessoninfo.get('keywords'):
                # lessoninfo['keywords'].get('header') # TODO: not doing anything with keyword header yet
                for keyword in lessoninfo['keywords'].get('content'):
                    terms = Term.objects.filter(term__iexact=keyword)
                    matches = terms.count()
                    if matches == 1:
                        lesson.terms.add(terms[0])
                    elif matches == 0:
                        log.warning(
                            f'Keyword `{keyword}` (used in lesson `{lesson.title}`, workshop `{workshop}` cannot be found in the existing glossary. Are you sure it is in the glossary and synchronized with the database? Make sure the data file for glossary is available ({GLOSSARY_FILE}) and that the term is defined in the file. Then run python manage.py ingestglossary.'
                        )
                    else:
                        log.error(
                            f'Multiple definitions of `{keyword}` exists in the database. Try resetting the glossary and rerun python manage.py ingestglossary before you run the ingestworkshop command again.'
                        )

    log.log('Added/updated workshops: ' + ', '.join([x[0] for x in workshops]))

    if not options.get('no_reminder'):
        log.log(
            'Do not forget to run `ingestprerequisites` after running the `ingestworkshop` command (without the --name flag).',
            color='yellow')

    # Evaluate all three saves eagerly: chaining them with `or` would skip the
    # remaining log files as soon as one call returned a truthy value.
    saved = [
        log._save(data='ingestworkshop', name='warnings.md', warnings=True),
        log._save(data='ingestworkshop', name='logs.md', warnings=False, logs=True),
        log._save(data='ingestworkshop', name='info.md', warnings=False, logs=False, info=True),
    ]
    if any(saved):
        log.log(
            f'Log files with any warnings and logging information is now available in: `{log.LOG_DIR}`',
            force=True)
class GitHubParser():
    """Convert GitHub-flavoured markdown snippets into HTML for the website.

    Markdown is converted through ``GitHubParserCache``; links in the result
    that point into the DHRI-Curriculum GitHub organisation are rewritten to
    shortcut URLs on curriculum.dhinstitutes.org where applicable.
    """

    # (first path element, second path element, label used in the log message,
    #  shortcut segment) for links that map onto website shortcuts.
    _SHORTCUTS = (
        ('glossary', 'terms', 'glossary term', 'term'),
        ('insights', 'pages', 'insight', 'insight'),
        ('install', 'guides', 'installation', 'install'),
    )

    def __init__(self, string: str = None, log=None):
        """Store `log`, or create a default ``Logger`` when none is given.

        `string` is accepted for backward compatibility and is not used.
        """
        self.log = Logger(name='github-parser') if log is None else log

    def convert(self, string):
        """Return the stripped ``markdown`` entry of the cache data for `string`."""
        c = GitHubParserCache(string=string)
        return c.data.get('markdown', '').strip()

    def strip_from_p(self, html):
        """Return the inner HTML of the first ``<p>`` in `html`, or `html` itself if there is none."""
        soup = BeautifulSoup(html, 'lxml')
        if soup.p:
            return ''.join([str(x) for x in soup.p.children])
        return html

    def _fix_link(self, tag):
        """Rewrite the ``href`` of `tag` when it points into DHRI-Curriculum.

        Only absolute links containing ``DHRI-Curriculum`` are considered.
        Local (``#...``) and relative links, and tags without an ``href``,
        are returned unchanged.
        """

        def find_workshop(elements):
            if elements[-1] == 'DHRI-Curriculum':
                return '{GH_CURRICULUM}'
            repo_names = [x[0] for x in AUTO_REPOS]
            for element in elements:
                if element in repo_names:
                    return element
            return ''

        href = tag.get('href')
        if not href:
            # Anchors without a target have nothing to rewrite.
            return tag

        elements = href.split('/')

        is_absolute = 'http:' in elements or 'https:' in elements
        if not is_absolute or 'DHRI-Curriculum' not in elements:
            return tag

        for first, second, label, shortcut in self._SHORTCUTS:
            if first in elements and second in elements:
                name = elements[-1].replace('.md', '')
                self.log.info(
                    f'Found link to an **{label}** and adding shortcut link to: curriculum.dhinstitutes.org/shortcuts/{shortcut}/{name}'
                )
                tag['href'] = f'https://curriculum.dhinstitutes.org/shortcuts/{shortcut}/{name}'
                return tag

        if 'raw.githubusercontent.com' in elements:
            raw_link = '/'.join(elements)
            self.log.info(
                f'Found link to **raw file** and will not change link: {raw_link}'
            )
            return tag

        workshop = find_workshop(elements)
        if workshop == '{GH_CURRICULUM}':
            gh_link = '/'.join(elements)
            self.log.info(
                f'Link found to **the DHRI Curriculum on GitHub**, linking to it: {gh_link}'
            )
        elif workshop == '':
            gh_link = '/'.join(elements)
            self.log.warning(
                f'Found link to workshop, which is not currently being loaded into the website, will therefore redirect to **workshop on GitHub**: {gh_link}'
            )
        else:
            self.log.info(
                f'Found link to **workshop** which (will) exist(s) on website, so changing to that: curriculum.dhinstitutes.org/workshops/{workshop}'
            )
            tag['href'] = f'https://curriculum.dhinstitutes.org/shortcuts/workshop/{workshop}'
        return tag

    def fix_html(self, text):
        """Convert markdown `text` to an HTML string with curly quotes and fixed links.

        Returns ``''`` for empty input. For single-line input that converts to
        exactly one top-level element containing a paragraph, only the inner
        HTML of that paragraph is returned; otherwise the children of
        ``<body>`` are returned.
        """
        if not text:
            return ''

        multiline = '\n' in text

        # Make text into HTML...
        text = self.convert(text)
        text = smartypants.smartypants(text)  # curly quote it
        soup = BeautifulSoup(text, 'lxml')

        for tag in soup.descendants:
            if tag.name == 'a':
                # TODO: Drop links that have no text
                self._fix_link(tag)

        body = soup.body
        if body is None:
            # Nothing was parsed (e.g. the conversion produced an empty string).
            return ''

        children = list(body.children)
        if not multiline and len(children) == 1 and body.p:
            # We only have one paragraph, so return the _text only_ from the p
            return ''.join([str(x) for x in body.p.children])

        return ''.join([str(x) for x in children])

    def quote_converter(self, string, reverse=False):
        """Replace dumb single/double quotes in `string` with smart quotes.

        Accounts for the possibility of HTML tags within the string. With
        ``reverse=True`` smart quotes are converted back to dumb quotes.
        ``None`` is passed through unchanged.

        Raises:
            TypeError: if `string` is neither ``None`` nor a ``str``.
        """
        if string is None:
            return None

        if not isinstance(string, str):
            raise TypeError(
                f'quote_converter expects a string, got {type(string).__name__}: {string!r}'
            )

        if string == '':
            return string

        if reverse:
            string = string.replace('“', '"').replace('”', '"')
            string = string.replace('‘', "'").replace("’", "'")
            return string

        # Find dumb double quotes coming directly after letters or punctuation,
        # and replace them with right double quotes.
        string = re.sub(r'([a-zA-Z0-9.,?!;:\'\"])"', r'\1”', string)
        # Find any remaining dumb double quotes and replace them with
        # left double quotes.
        string = string.replace('"', '“')
        # Follow the same process with dumb/smart single quotes
        string = re.sub(r"([a-zA-Z0-9.,?!;:\"\'])'", r'\1’', string)
        string = string.replace("'", '‘')
        return string
def handle(self, *args, **options):
    """Ingest the glossary datafile into the database.

    Reads the list of term entries from ``FULL_PATH`` and creates or updates a
    ``Term`` for each one, together with its tutorials, readings and cheat
    sheets. When a term already exists and the ``force`` option is not set,
    the user is asked before it is overwritten; declining skips the term.

    Options read: ``verbose``, ``silent``, ``force``.
    """
    log = Logger(path=__file__,
                 force_verbose=options.get('verbose'),
                 force_silent=options.get('silent'))
    input = Input(path=__file__)

    test_for_required_files(REQUIRED_PATHS=REQUIRED_PATHS, log=log)

    data = get_yaml(FULL_PATH, log=log)

    def _attach_resource(cat, category, add_field, point):
        """Create/update one Resource described by `point` and link it through `add_field`."""
        if not add_field or not category:
            log.error(
                f'Cannot interpret category `{cat}`. Make sure the script is correct and corresponds with the database structure.'
            )
        lookup = {
            'category': category,
            'title': point.get('linked_text'),
            'url': point.get('url'),
        }
        try:
            obj, _ = Resource.objects.update_or_create(
                annotation=point.get('annotation'), **lookup)
        except IntegrityError:
            # A resource with the same category/title/url already exists with
            # another annotation: update it in place and persist the change.
            obj = Resource.objects.get(**lookup)
            old_annotation = obj.annotation
            obj.annotation = point.get('annotation')
            obj.save()
            log.info(
                f'Another resource with the same URL, title, and category already existed so updated with a new annotation: **{obj.title} (old)**\n{old_annotation}\n-------\n**{obj.title} (new)**\n{obj.annotation}'
            )
        if obj not in add_field.all():
            add_field.add(obj)

    for termdata in data:
        term_name = termdata.get('term')

        try:
            term, created = Term.objects.get_or_create(term=term_name)
        except IntegrityError:
            # The term could not be created; look for an existing one by slug.
            created = False
            try:
                term = Term.objects.get(slug=dhri_slugify(term_name))
            except (Term.DoesNotExist, Term.MultipleObjectsReturned):
                log.error('An unknown error occurred. Try')
                # Without a term object there is nothing to update.
                continue

        # Ask BEFORE touching an existing term, so that declining really
        # leaves its stored definition untouched.
        if not created and not options.get('force'):
            choice = input.ask(
                f'Term `{term_name}` already exists. Update with new definition? [y/N]'
            )
            if choice.lower() != 'y':
                continue

        term.term = term_name
        term.explication = termdata.get('explication')
        term.save()

        term_resources = {
            'tutorials': (Resource.TUTORIAL, term.tutorials),
            'readings': (Resource.READING, term.readings),
            'cheat_sheets': (Resource.CHEATSHEET, term.cheat_sheets),
        }
        for cat, (category, add_field) in term_resources.items():
            for point in termdata.get(cat) or []:
                _attach_resource(cat, category, add_field, point)

    log.log('Added/updated terms: ' + ', '.join([x.get('term') for x in data]))

    # Evaluate all three saves eagerly: chaining them with `or` would skip the
    # remaining log files as soon as one call returned a truthy value.
    saved = [
        log._save(data='ingestglossary', name='warnings.md', warnings=True),
        log._save(data='ingestglossary', name='logs.md', warnings=False, logs=True),
        log._save(data='ingestglossary', name='info.md', warnings=False, logs=False, info=True),
    ]
    if any(saved):
        log.log(
            f'Log files with any warnings and logging information is now available in: `{log.LOG_DIR}`',
            force=True)
def handle(self, *args, **options):
    """Ingest the installation-instructions datafile into the database.

    For every entry in ``FULL_PATH`` and every operating system listed under
    its ``instructions``, the ``Software``, ``Instruction``, ``Step`` and
    ``Screenshot`` records are created or updated. Images and screenshots are
    copied to the media folder unless an identical file is already there.

    Options read: ``verbose``, ``silent``.
    """
    log = Logger(path=__file__,
                 force_verbose=options.get('verbose'),
                 force_silent=options.get('silent'))
    # Not used further in this command; constructed as in the original code.
    input = Input(path=__file__)

    test_for_required_files(REQUIRED_PATHS=REQUIRED_PATHS, log=log)

    data = get_yaml(FULL_PATH, log=log)

    for installdata in data:
        instructions = installdata.get('instructions') or {}

        for operating_system, steps in instructions.items():
            software, _ = Software.objects.get_or_create(
                operating_system=operating_system,
                software=installdata.get('software'))
            instruction, _ = Instruction.objects.update_or_create(
                software=software,
                defaults={
                    'what': installdata.get('what'),
                    'why': installdata.get('why')
                })

            original_file = installdata.get('image')
            if original_file:
                media_path = get_instruction_image_path(original_file)
                if instruction_image_exists(original_file) and filecmp.cmp(
                        original_file, media_path, shallow=False):
                    log.log(
                        f'Instruction image already exists. Ensuring path is in database: `{media_path}`'
                    )
                    instruction.image.name = get_instruction_image_path(
                        original_file, True)
                    instruction.save()
                else:
                    with open(original_file, 'rb') as f:
                        instruction.image = File(f,
                                                 name=os.path.basename(f.name))
                        instruction.save()
                    if filecmp.cmp(original_file, media_path):
                        log.info(
                            f'Instruction image has been updated so being copied to media path: `{media_path}`'
                        )
                    else:
                        log.info(
                            f'Instruction image has been copied to media path: `{media_path}`'
                        )
            else:
                log.warning(
                    f'An image for `{software}` does not exist. A default image will be saved instead. If you want a particular image for the installation instructions, follow the documentation.'
                )
                instruction.image.name = get_default_instruction_image()
                instruction.save()

            for stepdata in steps or []:
                step, _ = Step.objects.update_or_create(
                    instruction=instruction,
                    order=stepdata.get('step'),
                    defaults={
                        'header': stepdata.get('header'),
                        'text': stepdata.get('html')
                    })

                # A step without screenshots is valid; treat it as an empty list.
                for order, d in enumerate(stepdata.get('screenshots') or [],
                                          start=1):
                    path = d['path']
                    alt_text = d['alt']
                    media_path = get_screenshot_media_path(path)

                    # True when an identical copy already sits in the media folder.
                    unchanged = os.path.exists(media_path) and filecmp.cmp(
                        path, media_path, shallow=False)

                    s, _ = Screenshot.objects.get_or_create(step=step,
                                                            alt_text=alt_text,
                                                            order=order)
                    if unchanged:
                        s.image = get_screenshot_media_path(
                            path, relative_to_upload_field=True)
                        s.save()
                        log.log(f'Screenshot already exists: `{media_path}`')
                    else:
                        with open(path, 'rb') as f:
                            s.image = File(f, name=os.path.basename(f.name))
                            s.save()
                        if not filecmp.cmp(path, media_path, shallow=False):
                            log.log(
                                f'Screenshot was updated so re-saved: `{media_path}`'
                            )
                        else:
                            log.log(f'New screenshot saved: `{media_path}`')

    log.log('Added/updated installation instructions: ' +
            ', '.join([f'{x["software"]}' for x in data]))

    # Evaluate all three saves eagerly: chaining them with `or` would skip the
    # remaining log files as soon as one call returned a truthy value.
    saved = [
        log._save(data='ingestinstalls', name='warnings.md', warnings=True),
        log._save(data='ingestinstalls', name='logs.md', warnings=False, logs=True),
        log._save(data='ingestinstalls', name='info.md', warnings=False, logs=False, info=True),
    ]
    if any(saved):
        log.log(
            f'Log files with any warnings and logging information is now available in: `{log.LOG_DIR}`',
            force=True)
def handle(self, *args, **options):
    """Ingest the insights datafile into the database.

    For every entry in ``FULL_PATH`` an ``Insight`` is created or updated,
    its image is copied to the media folder (or a default image assigned when
    the entry has none), and its sections and operating-system specific
    sections are created or updated.

    Options read: ``verbose``, ``silent``.
    """
    # Imported locally so this block does not depend on a module-level import.
    import os

    log = Logger(path=__file__,
                 force_verbose=options.get('verbose'),
                 force_silent=options.get('silent'))
    # Not used further in this command; constructed as in the original code.
    input = Input(path=__file__)

    test_for_required_files(REQUIRED_PATHS=REQUIRED_PATHS, log=log)

    data = get_yaml(FULL_PATH, log=log)

    for insightdata in data:
        # An entry without image data falls through to the default image below
        # instead of crashing on `None.get(...)`.
        imagedata = insightdata.get('image') or {}

        # TODO: Insights and Software are also connected in a database table (insight_insight_software) but this relationship is not developed yet.
        insight, _ = Insight.objects.update_or_create(
            title=insightdata.get('insight'),
            defaults={
                'text': insightdata.get('introduction'),
                'image_alt': imagedata.get('alt')
            })

        original_file = imagedata.get('url')
        if original_file:
            media_path = get_insight_image_path(original_file)
            if insight_image_exists(original_file) and filecmp.cmp(
                    original_file, media_path, shallow=False):
                log.log(
                    f'Insight image already exists. Connecting existing paths to database: `{media_path}`'
                )
                insight.image.name = get_insight_image_path(
                    original_file, True)
                insight.save()
            else:
                with open(original_file, 'rb') as f:
                    insight.image = File(f, name=os.path.basename(f.name))
                    insight.save()
                if filecmp.cmp(original_file, media_path, shallow=False):
                    log.info(
                        f'Insight image has been updated and thus was copied to the media path: `{media_path}`'
                    )
                else:
                    log.info(
                        f'Insight image was not found and is copied to media path: `{media_path}`'
                    )
        else:
            log.warning(
                f'An image for `{insight}` does not exist. A default image will be saved instead. If you want a particular image for the installation instructions, follow the documentation.'
            )
            insight.image.name = get_default_insight_image()
            insight.save()

        for title, sectiondata in (insightdata.get('sections') or {}).items():
            Section.objects.update_or_create(
                insight=insight,
                title=title,
                defaults={
                    'order': sectiondata.get('order'),
                    'text': sectiondata.get('content')
                })

        for operating_system, osdata in (insightdata.get('os_specific')
                                         or {}).items():
            # Scope the lookup to this insight: section titles are only looked
            # up/created per insight above, so a title-only lookup could match
            # a section belonging to another insight.
            related_section = Section.objects.get(
                insight=insight, title=osdata.get('related_section'))
            OperatingSystemSpecificSection.objects.update_or_create(
                section=related_section,
                operating_system=operating_system,
                defaults={'text': osdata.get('content')})

    log.log('Added/updated insights: ' +
            ', '.join([x.get("insight") for x in data]))

    # Evaluate all three saves eagerly: chaining them with `or` would skip the
    # remaining log files as soon as one call returned a truthy value.
    saved = [
        log._save(data='ingestinsights', name='warnings.md', warnings=True),
        log._save(data='ingestinsights', name='logs.md', warnings=False, logs=True),
        log._save(data='ingestinsights', name='info.md', warnings=False, logs=False, info=True),
    ]
    if any(saved):
        log.log(
            f'Log files with any warnings and logging information is now available in: `{log.LOG_DIR}`',
            force=True)