def empty_post():
    "Mock a post."
    post = utils.create_objdict()
    post.md = ""
    post.html = ""
    post.meta = utils.create_objdict()
    post.meta.statistics = utils.create_objdict()
    post.meta.toc = utils.create_objdict()
    post.elements = utils.create_objdict()
    return post
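# Usage sketch (assumes pytest-style tests alongside this helper; objdict
# is assumed to be dict-like): each test gets a blank post to fill in.
def test_empty_post_defaults():
    post = empty_post()
    assert post.md == ""
    assert post.html == ""
    assert not post.meta.statistics  # empty objdict is falsy, like a dict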
def compute_stats(doc):
    "Compute word counts and readability scores for a spacy doc."
    ts = TextStats(doc)
    stats = create_objdict()
    counts = {
        'sentences': ts.n_sents,
        'words': ts.n_words,
        'unique_words': ts.n_unique_words,
        'chars': ts.n_chars,
        'chars_per_word': ts.n_chars_per_word,
        'long_words': ts.n_long_words,
        'syllables': ts.n_syllables,
        'syllables_per_word': ts.n_syllables_per_word,
        'monosyllable_words': ts.n_monosyllable_words,
        'polysyllable_words': ts.n_polysyllable_words
    }
    stats.counts = dict_to_objdict(counts)

    readability = {}
    if stats.counts.words > 0:
        readability = {
            'flesch_kincaid_grade_level': ts.flesch_kincaid_grade_level,
            'flesch_reading_ease': ts.flesch_reading_ease,
            'smog_index': 0,
            'gunning_fog_index': ts.gunning_fog_index,
            'coleman_liau_index': ts.coleman_liau_index,
            'automated_readability_index': ts.automated_readability_index,
            'lix': ts.lix,
        }
        # SMOG is only meaningful on texts of 30+ sentences.
        if stats.counts.sentences >= 30:
            readability['smog_index'] = ts.smog_index
    stats.readability = dict_to_objdict(readability)
    return stats
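# Usage sketch (hypothetical text; assumes the textacy helpers this module
# already uses, make_spacy_doc and SPACY_MODEL included):
def example_compute_stats():
    doc = make_spacy_doc("Short example text. It has two sentences.",
                         lang=SPACY_MODEL)
    stats = compute_stats(doc)
    # smog_index stays 0 here: fewer than 30 sentences.
    print(stats.counts.words, stats.readability.flesch_reading_ease)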
def process(self, unused, site, config):
    try:
        num_related_posts = config.num_related_posts

        # Tokenize. Exclude pages that are not posts.
        docs = []
        valid_posts = []
        for post in site.posts:
            if post.meta.microdata_type not in RelatedPosts.VALID_FORMAT:
                continue
            txt = post.md
            docs.append(gensim.utils.simple_preprocess(
                txt, deacc=True, min_len=3, max_len=15))
            valid_posts.append(post)
        # Fixme: stemming

        # Build the model.
        dictionary = corpora.Dictionary(docs)
        corpus = [dictionary.doc2bow(doc) for doc in docs]
        tfidf = models.tfidfmodel.TfidfModel(corpus=corpus)

        # Fixme: get correct number of topics.
        # Use the number of posts as a proxy for the number of topics;
        # integer division, with a floor of 1 for small sites.
        num_topics = max(1, len(site.posts) // 5)
        topic_model = models.LsiModel(tfidf[corpus], id2word=dictionary,
                                      num_topics=num_topics)
        # +1 because the best match for a post is the post itself.
        index = similarities.MatrixSimilarity(
            topic_model[tfidf[corpus]], num_best=num_related_posts + 1)

        # Find similar posts and store them.
        log_details = ""
        for post, sims in zip(valid_posts, index):
            post.meta.related_posts = []
            log_details += ('<div class="subsection"><h3>%s</h3>'
                            'Related posts:<ol>' % post.meta.title)
            for idx, score in sims[1:]:  # skip the first: the post itself
                p = valid_posts[idx]
                o = utils.create_objdict()
                o.meta = p.meta
                o.score = score
                o.html = p.html
                post.meta.related_posts.append(o)
                log_details += '<li>%s (%s)</li>' % (o.meta.title,
                                                     round(score, 2))
            log_details += '</ol></div>'
        return (SiteFab.OK, "Related posts via LSI", log_details)
    except Exception as e:
        return (SiteFab.ERROR, "Related posts via LSI", str(e))
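# Stand-alone sketch of the TF-IDF + LSI similarity pipeline used above
# (the three document strings are made up for illustration):
def example_lsi_similarity():
    import gensim
    from gensim import corpora, models, similarities

    texts = ["neural networks for image classification",
             "convolutional networks classify images well",
             "how to bake sourdough bread at home"]
    docs = [gensim.utils.simple_preprocess(t, deacc=True,
                                           min_len=3, max_len=15)
            for t in texts]
    dictionary = corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(d) for d in docs]
    tfidf = models.TfidfModel(corpus=corpus)
    lsi = models.LsiModel(tfidf[corpus], id2word=dictionary, num_topics=2)
    index = similarities.MatrixSimilarity(lsi[tfidf[corpus]], num_best=2)
    for doc_id, sims in enumerate(index):
        print(doc_id, sims)  # [(doc_id, score), ...]; the best is itself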
def test_objdict_to_dict():
    od = utils.create_objdict()
    od.str = "str"
    od.int = 1
    od.array = [1, 2, 3]
    od.arrayofarray = [[1, 2], [3, 4]]
    d = utils.objdict_to_dict(od)
    assert d['str'] == "str"
    assert d['int'] == 1
    assert d['array'][2] == 3
    assert d['arrayofarray'][0][0] == 1  # testing nested array
    assert d['arrayofarray'][1][1] == 4  # testing nested array
def analyze_post(post, debug=False):
    "Perform NLP analysis."
    counters = PerfCounters()
    nlp = create_objdict()

    # clean fields
    counters.start('cleanup')
    clean_fields = generate_clean_fields(post)
    nlp.clean_fields = clean_fields
    counters.stop('cleanup')

    # creating spacy docs
    counters.start('make_spacy_docs')
    all_cleaned_content = ' '.join([clean_fields.title,
                                    clean_fields.category,
                                    " ".join(clean_fields.tags),
                                    clean_fields.abstract,
                                    clean_fields.text])

    # overall terms
    cleaned_doc = make_spacy_doc(all_cleaned_content, lang=SPACY_MODEL)

    # title terms
    title_doc = make_spacy_doc(clean_fields.title, lang=SPACY_MODEL)

    # for statistics
    text_doc = make_spacy_doc(post.text, lang=SPACY_MODEL)
    counters.stop('make_spacy_docs')

    # terms extraction
    counters.start('extract_key_terms')
    nlp.terms = extract_key_terms(cleaned_doc, num_terms=NUM_TERMS,
                                  algo=TERM_EXTRACTOR_ALGO, ngrams=NGRAMS)

    # !note: we restrict ngrams to 1 as we only want the lemmatized
    # top terms for the title.
    nlp.title_terms = extract_key_terms(title_doc, num_terms=NUM_TERMS,
                                        algo=TERM_EXTRACTOR_ALGO, ngrams=1)
    counters.stop('extract_key_terms')

    # text stats
    counters.start('text_stats')
    nlp.stats = compute_stats(text_doc)
    counters.stop('text_stats')

    if debug:
        counters.report()
    return nlp
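# Usage sketch (hypothetical post object; field values are made up, and
# the mock helper from the test utils is borrowed for brevity):
def example_analyze_post():
    post = empty_post()
    post.meta.title = "Intro to Markov chains"
    post.meta.tags = ["math", "probability"]
    post.text = "Markov chains model state transitions. They are simple."
    nlp = analyze_post(post, debug=True)
    print(nlp.terms, nlp.stats.counts.words)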
def generate_clean_fields(post):
    "Generate a cleaned up version of the post and its metadata."
    clean_fields = create_objdict()

    # cleaned up fields
    clean_fields.title = ''
    if post.meta.title:
        clean_fields.title = text_cleanup(post.meta.title)

    clean_fields.abstract = ""
    if post.meta.abstract:
        clean_fields.abstract = text_cleanup(post.meta.abstract)

    clean_fields.authors = []
    if post.meta.authors:
        for author in post.meta.authors:
            clean_fields.authors.append(text_cleanup(author))

    # conference
    clean_fields.conference_name = ""
    if post.meta.conference_name:
        clean_fields.conference_name = text_cleanup(
            post.meta.conference_name)

    clean_fields.conference_short_name = ""
    if post.meta.conference_short_name:
        clean_fields.conference_short_name = text_cleanup(
            post.meta.conference_short_name)

    # category, tags, etc
    clean_fields.category = ""
    if post.meta.category:
        clean_fields.category = text_cleanup(post.meta.category)

    clean_fields.tags = []
    if post.meta.tags:
        for tag in post.meta.tags:
            clean_fields.tags.append(text_cleanup(tag))

    # text
    clean_fields.text = ''
    if post.text:
        # !make sure to use post html and clean it to avoid markup
        # keywords.
        clean_fields.text = text_cleanup(post.text)
    return clean_fields
def lint(self, post, rendered_post, site):
    """Lint a post and record its errors and warnings.

    Args:
        post (Post): the post to analyze.
        rendered_post (str): the html version of the post.
        site (Sitefab): the site object, mainly used to get access
            to plugin data.

    Return:
        dict: linting results.
    """
    results = utils.create_objdict()
    results.has_errors = 0
    results.has_warnings = 0

    # frontmatter
    results.info = frontmatter.lint(post, self.test_info, self.config)

    # images
    if 'image_info' in site.plugin_data:
        image_info = site.plugin_data['image_info']
    else:
        image_info = None
    img_results = images.lint(post, self.test_info, self.config,
                              image_info)
    results.info.extend(img_results)

    structure_results = structure.lint(post, self.test_info, self.config)
    results.info.extend(structure_results)

    # Each info entry starts with its code; errors begin with "E",
    # warnings with "W".
    for d in results.info:
        if d[0][0] == "E":
            results.has_errors += 1
        if d[0][0] == "W":
            results.has_warnings += 1

    if results.has_errors or results.has_warnings:
        self.results[post.filename] = results
    return results
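# Usage sketch (hypothetical wiring; assumes the Linter instance created
# in SiteFab.__init__ below and an already rendered post):
def example_lint_post(site, post, rendered_html):
    results = site.linter.lint(post, rendered_html, site)
    if results.has_errors or results.has_warnings:
        print("%s: %d error(s), %d warning(s)" % (
            post.filename, results.has_errors, results.has_warnings))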
def __init__(self, config_filename, version='1.0'):
    # Timers
    self.cnts = PerfCounters()
    self.cnts.start('Overall')
    self.cnts.start('Init')

    # [configuration]
    self.current_dir = Path.cwd()
    if not config_filename:
        raise Exception("Supply a configuration filename")
    # make the config file path absolute to avoid weird cases
    self.config_filename = Path(config_filename).resolve()
    if self.config_filename.is_file():
        self.config = files.load_config(self.config_filename)
    else:
        utils.error("Config file %s not found" % self.config_filename)

    # the site root dir is one level above where the config lives
    self.config.root_dir = self.config_filename.parents[1]

    self.config.build = utils.create_objdict()
    # expose sitefab version to the templates
    self.config.build.sitefab_version = version

    # [parser]
    # initialize the parser config
    parser_tpl_path = Path(self.config.parser.template_dir)
    self.config.parser.templates_path = (self.config.root_dir /
                                         parser_tpl_path)
    self.config.parser = Parser.make_config(self.config.parser)

    # [plugins]
    # load the various config files from disk
    plugins_config = defaultdict(dict)
    for d in self.config.plugins.config_dir:
        config_dir = self.config.root_dir / d
        for config_fname in files.get_files_list(config_dir, '*.yaml'):
            plugin_name = config_fname.stem
            category = "%s%s" % (
                str(config_fname.parts[-3]).capitalize(),
                str(config_fname.parts[-2]).capitalize())
            config = files.load_config(config_fname)
            plugins_config[category][plugin_name] = config

    # where to redirect the standard python log
    debug_log_fname = self.get_logs_dir() / "debug.log"
    self.plugins = Plugins(self.get_plugins_dirs(), debug_log_fname,
                           plugins_config)

    # Store data generated by plugins that can be used later.
    self.plugin_data = {}
    self.plugin_results = defaultdict(int)

    # [template rendering engine]
    self.jinja2 = Environment(
        loader=FileSystemLoader(str(self.get_template_dir())),
        extensions=['jinja2.ext.do'])

    # loading templates custom functions
    custom_filters = self.plugins.get_template_filters()
    for flt_name, flt_fct in custom_filters.items():
        self.jinja2.filters[flt_name] = flt_fct

    # [logger]
    cfg = utils.create_objdict()
    cfg.output_dir = self.get_logs_dir()
    # use the log templates, not the ones from the users
    tpl_dir = self.config.root_dir / Path(self.config.logger.template_dir)
    self.config.logger.template_dir = tpl_dir
    cfg.template_dir = tpl_dir
    cfg.log_template = "log.html"
    cfg.log_index_template = "log_index.html"
    cfg.stats_template = "stats.html"
    self.logger = Logger(cfg, self)

    # [linter]
    linter_config_filename = (self.config.root_dir /
                              self.config.linter.configuration_file)
    linter_config = files.load_config(linter_config_filename)
    linter_config.report_template_file = (
        self.config.root_dir / self.config.linter.report_template_file)
    linter_config.output_dir = self.get_logs_dir()
    linter_config.site_output_dir = self.get_output_dir()
    self.linter = Linter(linter_config)

    # Finding content and assets.
    self.filenames = utils.create_objdict()
    self.filenames.posts = files.get_files_list(self.get_content_dir(),
                                                "*.md")

    # Cleanup the output directories.
    files.clean_dir(self.get_output_dir())

    self.cnts.stop('Init')
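# Usage sketch (the config path is hypothetical): constructing the site
# object runs the whole init sequence above.
def example_init_site():
    site = SiteFab("mysite/sitefab.yaml")
    print("sitefab", site.config.build.sitefab_version)
    print(len(site.filenames.posts), "posts found")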
def test_create_objdict():
    od = utils.create_objdict()
    od2 = objdict()
    assert type(od) == type(od2)