def get_post_with_greed(url_body, parent_doc=None):
    """Resolve a post document from a URL as greedily as possible: first by the exact
    doc path, then relative to the parent doc's folder (optionally prefixed with the
    current data root), and finally via a keyword search (get_one_post_by_es)."""
    pure_url_body = re.split("[?#]", url_body)[0]
    post_url = pure_url_body
    d = get_data_namespace()
    post_doc = d.get_doc(post_url)
    current_data_root = get_current_data_root()
    parent_doc = parent_doc or get_doc_in_request()
    if not post_doc and is_a_markdown_file(post_url) and parent_doc and isinstance(parent_doc, dict):
        filename = post_url
        if "/post/" in filename:
            filename = filename.split("/post/", 1)[-1]
        parent_post_doc_path = get_path_from_record(parent_doc)
        if parent_post_doc_path:
            post_doc_parent = os.path.split(parent_post_doc_path)[0]
            if post_doc_parent:
                abs_path = "%s/%s" % (post_doc_parent.strip("/"), filename.strip("/"))
                post_doc = d.get_doc_by_path(abs_path)
                if current_data_root and not post_doc:
                    # prepend the data root (wiki_root) as a prefix and try to match again
                    abs_path = "%s/%s" % (current_data_root, filename.strip("/"))
                    post_doc = d.get_doc_by_path(abs_path)
    if not post_doc:
        # fall back to a keyword-hit search
        bucket = get_bucket_in_request_context()
        post_name = (get_get_var(url_body, "name") or "").strip()
        if post_name:
            if "." in post_name:
                post_name = os.path.splitext(post_name)[0]
            post_doc = get_one_post_by_es(bucket, keywords=post_name, under=current_data_root)
        if not post_doc and is_a_markdown_file(post_url):
            # search by the bare filename directly
            just_post_file_name = get_just_name(post_url)
            if just_post_file_name != post_name:
                post_doc = get_one_post_by_es(bucket, keywords=just_post_file_name, under=current_data_root)
    return post_doc
def load_all_posts_visits_from_csv(bucket, csv_file_record):
    visits_db_name = get_visits_db_name_for_bucket(bucket)
    current_visits_size = hsize(visits_db_name)
    if current_visits_size > 5000:
        # if more than 5k entries have piled up, clear first to avoid excessive redundancy
        hclear(visits_db_name)
    raw_objects = csv_file_record.get('objects') or []
    if not raw_objects:
        return
    if not isinstance(raw_objects, (list, tuple)):
        return
    # handle at most 3k records, so a single bucket cannot grow large enough to cause performance problems
    for data_obj in raw_objects[:3000]:
        if not isinstance(data_obj, dict):
            continue
        path = data_obj.get('path')
        if not path or not isinstance(path, string_types):
            continue
        path = path.strip('/').lower()
        if not is_a_markdown_file(path):
            continue
        visits = to_int(data_obj.get('visits'), default_if_fail=0)
        visitors = to_int(data_obj.get('visitors'), default_if_fail=0)
        visits_key = get_visits_key(path, field='visits')
        visitors_key = get_visits_key(path, field='visitors')
        hset(visits_db_name, visits_key, visits)
        hset(visits_db_name, visitors_key, visitors)
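
# Illustrative sketch (not part of the original module): the rough shape of the
# csv_file_record consumed by load_all_posts_visits_from_csv above. The field names
# 'objects', 'path', 'visits' and 'visitors' come from the lookups in that function;
# the concrete paths and numbers below are made up for illustration.
_EXAMPLE_VISITS_CSV_RECORD = {
    'objects': [
        {'path': '/notes/hello-world.md', 'visits': '120', 'visitors': '45'},
        {'path': '/notes/cover.jpg', 'visits': '9', 'visitors': '3'},  # skipped: not a markdown file
    ]
}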
def get_bucket_markdown_record_ids(bucket, under='', max_limit=20000):
    record_ids = []
    paths_and_ids = get_paths_and_ids_under(bucket=bucket, under=under, max_limit=max_limit)
    for (path, record_id) in paths_and_ids:
        if is_a_markdown_file(path):
            record_ids.append(record_id)
    return record_ids
def should_md_doc_hit_folder_compiler(self):
    if not self.filepath:
        return False
    if not os.path.isfile(self.filepath):
        return False
    if not is_a_markdown_file(self.filepath):
        return False
    just_name = get_just_name(self.filepath)
    if just_name == 'index':
        return True
    else:
        return False
def get_template_info(template_dir):
    info = {}
    template_dir = template_dir.strip().rstrip('/')
    if not os.path.isdir(template_dir):
        return info  # ignore
    filepaths = get_all_sub_files(template_dir, accept_func=os.path.isfile, max_tried_times=1000)
    for filepath in filepaths:
        relative_path = get_relative_path(filepath, root=template_dir).lower()  # lower case
        if not os.path.isfile(filepath):
            continue
        if not is_real(filepath) or is_a_hidden_path(relative_path):
            continue
        if relative_path.startswith('readme.') and is_a_markdown_file(relative_path):
            # information declared in the template's readme
            with open(filepath, 'rb') as f:
                raw_markdown_content = smart_unicode(f.read())
            compiled_markdown_content = compile_markdown(raw_markdown_content)
            compiled_markdown_content_meta = compiled_markdown_content.metadata
            readme_info = dict(content=compiled_markdown_content,
                               metadata=compiled_markdown_content_meta)  # raw_content=raw_markdown_content,
            info['_readme'] = readme_info
        else:
            path_without_ext, ext = os.path.splitext(relative_path)
            ext = ext.strip('.').lower()
            if ext not in allowed_exts:
                continue
            with open(filepath, 'rb') as f:
                raw_content = f.read()
            raw_content = smart_unicode(raw_content)  # to unicode
            info[relative_path] = raw_content
            matched_compiler = template_resource_compilers.get(ext)
            if matched_compiler:
                new_ext, compile_func = matched_compiler
                try:
                    compiled_content = compile_func(raw_content)
                    new_key = path_without_ext + '.' + new_ext.strip('.')
                    info[new_key] = compiled_content
                except Exception as e:
                    error_message = getattr(e, 'message', None)
                    if error_message:
                        try:
                            print('%s error: %s' % (relative_path, error_message))
                        except:
                            pass
    info['_route'] = get_templates_route_info(info)
    return info
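
# Hypothetical sketch of the template_resource_compilers mapping used by get_template_info
# above: each key is a source extension and each value is a (target_extension, compile_func)
# pair, which is how the `new_ext, compile_func = matched_compiler` unpacking reads.
# The entries below are invented for illustration only.
def _example_scss_compiler(raw_content):
    # placeholder compile function; a real one would return the compiled CSS text
    return raw_content

_EXAMPLE_TEMPLATE_RESOURCE_COMPILERS = {
    'scss': ('css', _example_scss_compiler),
}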
def append_to_markdown_record(bucket, relative_path, content_to_append,
                              lines_to_append=1, more_line_when_seconds_passed=0, position='tail'):
    record = get_record_by_path(bucket, path=relative_path) or {}
    record_type = record.get('_type') or record.get('type')
    if record_type != 'post':
        return 'ignore'  # ignore
    if not is_a_markdown_file(relative_path):
        return 'ignore'
    old_content = record.get('raw_content') or record.get('content') or ''
    old_content = smart_unicode(old_content)
    now = time.time()
    old_timestamp = record.get('timestamp')
    if more_line_when_seconds_passed and old_timestamp and isinstance(old_timestamp, (int, float)):
        # after this many seconds have passed, automatically leave a blank line,
        # which effectively starts a new "paragraph"
        diff = now - old_timestamp
        if diff > more_line_when_seconds_passed:
            lines_to_append += 1
    interval_empty_lines = '\r\n' * abs(to_int(lines_to_append, max_value=10))  # blank lines between appends
    content_to_append = smart_unicode(content_to_append).strip()
    if old_content.endswith('\n' + content_to_append) or old_content == content_to_append:
        return 'ignore'  # ignore, do not append duplicated content
    if position == 'tail':  # append to the tail by default
        new_content = '%s%s' % (interval_empty_lines, content_to_append)
        content = '%s%s' % (old_content, new_content)
    else:
        new_content = '%s%s' % (content_to_append, interval_empty_lines)
        content = '%s%s' % (new_content, old_content)
    error_info = sync_file_by_server_side(bucket=bucket, relative_path=relative_path, content=content)
    return error_info
def json_data_for_sync(self):
    if not self.relative_path:
        return  # ignore
    if self.filepath and not os.path.exists(self.filepath) and not self.is_deleted:
        return  # ignore too
    # these are the arguments accepted by BasicSyncCompiler
    kwargs = dict(
        relative_path=self.relative_path,
        real_relative_path=self.real_relative_path,
        abs_filepath=self.filepath,
        private_key=self.private_key,
        should_encrypt_file=self.should_encrypt_file,
        is_dir=self.is_dir,
        is_deleted=self.is_deleted,
        ipfs_key=self.ipfs_key,
        version=self.version,
        raw_content=self._raw_content,
        files_info=self.files_info,
    )
    matched_compiler = None
    is_markdown = is_a_markdown_file(self.relative_path)
    if self._raw_content:
        is_file = True
    elif self.filepath:
        is_file = os.path.isfile(self.filepath)
    elif self.is_dir:
        is_file = False
    else:
        is_file = True
    if self.is_dir:
        matched_compiler = FolderSyncCompiler(**kwargs)
    elif is_markdown and self.relative_path not in FILE_TYPE_FILENAMES:
        matched_compiler = PostSyncCompiler(**kwargs)
    elif is_file and self.relative_path in VISITS_FILEPATHS:
        matched_compiler = VisitsSyncCompiler(**kwargs)
    elif is_file and self.lower_relative_path.startswith('_comments/'):
        matched_compiler = CommentsSyncCompiler(**kwargs)
    if matched_compiler:
        matched_data = matched_compiler.compiled_data
        doc_type = matched_data.get('_type')
        # accessing compiled_data triggers the compile logic; if compiler.should_ignore_current_file,
        # return {} here, which effectively means "do not sync"
        if matched_compiler.should_ignore_current_file:
            return {}
        if matched_data:
            matched_json_data = self.json_dumps(matched_data)
            if len(matched_json_data) < MAX_RECORD_SIZE:
                return matched_json_data
            else:
                # if the size exceeds the limit, fall through to the generic file logic below,
                # handled as a common record
                if doc_type in ['visits', 'comments']:
                    # these types are not handled as plain files; anything over 300 kb is invalid
                    # (because of the data conversion, the effective size is actually around 100 kb)
                    return {}
    # type matching failed above, or the size was too large: handle as a common record
    common_file_compiler = FileSyncCompiler(**kwargs)
    compiled_data = common_file_compiler.compiled_data
    if compiled_data:
        compiled_json_data = self.json_dumps(compiled_data)
        if len(compiled_json_data) < MAX_RECORD_SIZE:
            return compiled_json_data
    data = common_file_compiler.basic_compiled_data
    compiled_json_data = self.json_dumps(data)
    return compiled_json_data
def get_files_info(bucket):
    data = {}
    path_bucket = get_bucket_name_for_path(bucket)
    data['files'] = {}
    data['folders'] = {}
    data['lower_files'] = {}
    data['lower_folders'] = []  # not a dict
    lower_folders = []
    lower_folders_count = {}
    records = hscan(path_bucket, key_start='', limit=20000)
    for filepath, filepath_data_string in records:
        if filepath.startswith('_'):
            continue
        lower_filepath = filepath.strip().lower()
        # prepare raw data starts
        raw_filepath_data = filepath_data_string.split(',')
        if len(raw_filepath_data) != 3:
            continue
        filepath_data_keys = ['record_id', 'size', 'version']
        filepath_data = dict(zip(filepath_data_keys, raw_filepath_data))
        filepath_data['size'] = to_int(filepath_data['size'], default_if_fail=0)
        if filepath_data.get('version') == 'folder':
            #is_dir = True
            is_image = False
            is_markdown = False
            data['folders'][filepath] = filepath_data
            if lower_filepath not in lower_folders:
                lower_folders.append(lower_filepath)
        else:
            #is_dir = False
            # prepare raw data ends
            is_image = is_a_image_file(filepath)
            is_markdown = is_a_markdown_file(filepath)
            data['files'][filepath] = filepath_data
            data['lower_files'][filepath.strip().lower()] = filepath_data
        lower_folder_path = os.path.split(filepath.strip().lower())[0]
        if lower_folder_path:
            parts = lower_folder_path.split('/')
            parts_length = len(parts)
            if parts_length > 10:
                continue
            for i in range(parts_length):
                one_lower_folder_path = '/'.join(parts[:i + 1])
                last_path_part = one_lower_folder_path.split('/')[-1]
                if last_path_part.startswith('_'):
                    continue
                if one_lower_folder_path not in lower_folders:
                    lower_folders.append(one_lower_folder_path)
                if one_lower_folder_path:
                    images_count_plus = 1 if is_image else 0
                    posts_count_plus = 1 if is_markdown else 0
                    _images_count_plus = 1 if images_count_plus and lower_folder_path == one_lower_folder_path else 0
                    _posts_count_plus = 1 if posts_count_plus and lower_folder_path == one_lower_folder_path else 0
                    matched_count = lower_folders_count.setdefault(one_lower_folder_path, {})
                    matched_count['images_count'] = matched_count.get('images_count', 0) + images_count_plus
                    matched_count['posts_count'] = matched_count.get('posts_count', 0) + posts_count_plus
                    matched_count['_images_count'] = matched_count.get('_images_count', 0) + _images_count_plus
                    matched_count['_posts_count'] = matched_count.get('_posts_count', 0) + _posts_count_plus
    data['lower_folders'] = lower_folders
    data['lower_folders_count'] = lower_folders_count
    data['date'] = time.time()
    return data
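
# Illustrative sketch of the per-path values parsed by get_files_info above: each hscan
# record maps a filepath to a comma-separated 'record_id,size,version' string, and a folder
# stores the literal 'folder' in the version slot (which is how the branch above detects it).
# The concrete ids, sizes and paths below are made up for illustration.
_EXAMPLE_PATH_RECORDS = [
    ('posts/hello.md', 'r1a2b3,2048,5'),   # a file: record_id, size in bytes, version
    ('posts', 'r9z8y7,0,folder'),          # a folder: version field is the literal 'folder'
]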
def get_linked_docs_from_markdown_content(path, raw_content, md_link_abs_check_func=None):
    # return [unicode]
    if not raw_content:
        return [], []
    if not isinstance(raw_content, string_types):
        return [], []
    raw_content = smart_unicode(raw_content)
    # [xxx](/???.md)
    maybe_md_links = []
    for m in re.finditer("(?:(?<=^)|(?<!!))(\\[.*?\\])\\((.*?)\\)", raw_content):
        link = m.group(2)
        if "://" in link:
            continue
        if "?" in link:
            link = link.split("?")[0]
        if "#" in link and not link.startswith("#"):
            link = link.split("#", 1)[0]
        link = link.strip()
        if is_a_markdown_file(link):  # here, must be a markdown file
            if link not in maybe_md_links:
                maybe_md_links.append(link)
    for m in re.finditer("(?<!\[)(\[\[)([^\[\]]+)(\]\])", raw_content):
        # [[ xxx ]]
        # [[ xxx | title ]]
        # [[ xxx | title # id ]]
        link = m.group(2)
        link, link_title, link_id = get_link_title_id_in_wiki_syntax(link)
        if "?" in link:
            link = link.split("?")[0]
        if "#" in link and not link.startswith("#"):
            link = link.split("#", 1)[0]
        if not link:
            continue
        link = link.strip()
        if link not in maybe_md_links:
            maybe_md_links.append(link)
    # validate the collected links
    tags = []
    post_parent_path = path.strip("/").rsplit("/", 1)[0]
    link_paths = []
    for maybe_md_link in maybe_md_links:
        if maybe_md_link.startswith("#"):
            tag = maybe_md_link.lstrip("#")
            if tag not in tags:
                tags.append(tag)
            continue
        if not is_a_markdown_file(maybe_md_link):
            # by default add the .md ext to the link
            maybe_md_link += ".md"
        if maybe_md_link.startswith("/"):
            # already a full path relative to the root
            link = maybe_md_link
        else:
            if md_link_abs_check_func and md_link_abs_check_func(maybe_md_link):
                # the check function decided the leading / was omitted; complete it here
                link = "/%s" % maybe_md_link.strip("/")
            else:
                link = "%s/%s" % (post_parent_path, maybe_md_link.strip("/"))
        if not link:
            continue
        # normalize to lower case
        link = link.lower().lstrip("/")
        if link not in link_paths:
            link_paths.append(link)
    return link_paths, tags
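
# Illustrative sketch for get_linked_docs_from_markdown_content above. The markdown body
# and paths are made up; it only shows the two link syntaxes the function scans for
# (regular [text](target.md) links and [[ wiki ]] links), that image and external links
# are skipped, and that relative targets are resolved against the post's parent folder
# and lower-cased.
def _example_extract_links():
    raw_content = (
        "See [setup](install.md) and [[ Roadmap | our plan ]].\n"
        "External [site](https://example.com) links and ![images](pic.md) are ignored.\n"
    )
    # for a post at 'docs/guide.md', relative targets resolve under 'docs/'
    return get_linked_docs_from_markdown_content("docs/guide.md", raw_content)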
def append_to_markdown_doc_and_sync(bucket, path, content, lines_to_append=1, reverse=False,
                                    do_not_repeat=True, lines_more_diff=None, draft_by_default=False):
    # by default, check whether the appended content is a duplicate
    if not bucket or not path or not content:
        return
    if not isinstance(bucket, string_types) or not isinstance(path, string_types) or not isinstance(content, string_types):
        return
    if not has_bucket(bucket):
        return
    if not is_a_markdown_file(path):
        return
    content = smart_unicode(content)
    old_doc = get_record_by_path(bucket, path=path) or {}
    if not isinstance(old_doc, dict):
        old_doc = {}
    if lines_more_diff:
        # after this much time has passed, automatically add one extra blank line
        if old_doc and old_doc.get('timestamp'):
            try:
                diff = time.time() - old_doc.get('timestamp')
                if diff > lines_more_diff:
                    lines_to_append += 1
            except:
                pass
    interval = '\r\n' * abs(to_int(lines_to_append, max_value=10))  # blank lines between appends
    if old_doc:
        if get_type_from_record(old_doc) == 'post':
            # only post-type documents support append for now
            old_content = old_doc.get('raw_content')
            if old_content == " ":
                old_content = ""
            if do_not_repeat:
                if reverse:
                    if old_content.strip().startswith(content.strip()):
                        return ""
                else:
                    old_content_s = old_content.strip()
                    appended_content_s = content.strip()
                    if old_content_s.endswith('\n' + appended_content_s) or old_content_s == appended_content_s:
                        return ''  # ignore, do not append duplicated content
            if reverse:
                # insert at the head
                new_content = '%s%s' % (content, interval)
                content = '%s%s' % (new_content, old_content)
            else:
                new_content = '%s%s' % (interval, content)
                content = '%s%s' % (old_content, new_content)
        else:
            return
    else:
        # new doc
        content = content.strip()
        if draft_by_default:
            # a newly created document defaults to draft status
            if re.match(u"\w+[:\uff1a]", content):
                # the user may have declared metadata themselves
                content = "status: draft\n%s" % content
            else:
                now = get_now_from_bucket(bucket)
                content = "date: %s\nstatus: draft\n\n%s" % (now, content)
    sync_file_by_server_side(bucket=bucket, relative_path=path, content=content)
    return True  # done
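
# Hypothetical usage sketch for append_to_markdown_doc_and_sync above; the bucket value,
# path and content are invented. When the target document does not exist yet and
# draft_by_default is True, the new file is created with 'status: draft' (plus a 'date:'
# line unless the content already starts with its own metadata declaration).
def _example_quick_capture(bucket):
    return append_to_markdown_doc_and_sync(
        bucket,
        path='inbox/quick-notes.md',
        content='remember to rotate the API keys',
        lines_to_append=1,
        lines_more_diff=3600,        # leave an extra blank line after an hour of silence
        draft_by_default=True,
    )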
def update_post_tags_words_info(bucket, record_data):
    """Maintain the per-bucket 'posts' config (word counts, tag index and doc-link index)
    when a markdown record changes."""
    path = get_path_from_record(record_data)
    if not path:
        return
    lower_path = path.lower().lstrip('/')
    if not is_a_markdown_file(path):
        return
    if lower_path.startswith('_nav/'):
        return
    posts_info = get_bucket_posts_info(bucket) or {}
    # data init
    bucket_text_words = to_int(posts_info.get('text_words') or 0, default_if_fail=0)
    # prepare tags info
    # {'paths': {path: [tag1, tag2]}, 'tags': {tag: [path1, path2]}}
    tags_info = posts_info.setdefault('tags', {})
    tags_info_tags = tags_info.setdefault('tags', {})
    tags_info_paths = tags_info.setdefault('paths', {})
    # prepare links info
    # {'paths': {path: [linked_path1, linked_path2]}, 'links': {linked_path: [back_path1, back_path2]}}
    links_info = posts_info.setdefault("links", {})
    links_info_links = links_info.setdefault("links", {})
    links_info_paths = links_info.setdefault("paths", {})
    words_info = posts_info.setdefault('words', {})  # {path: text_words}
    is_deleted = record_data.get('is_deleted', False)
    post_status = record_data.get('status') or 'public'
    post_tags = record_data.get('tags') or []
    if not isinstance(post_tags, (list, tuple)):
        post_tags = []
    post_doc_links, wiki_tags = get_linked_docs_from_markdown_content(
        path, record_data.get("raw_content"),
        md_link_abs_check_func=partial(has_record_by_path, bucket))
    if not isinstance(post_doc_links, (list, tuple)):
        post_doc_links = []
    text_words = to_int(record_data.get('text_words'), default_if_fail=0)
    # if words info already exists for this path, subtract it first to avoid double counting
    old_text_words = to_int(words_info.get(lower_path), default_if_fail=0)
    if old_text_words:
        bucket_text_words -= old_text_words
    if is_deleted or post_status in ['draft', 'private']:
        # deleted or draft-like statuses are excluded from the stats; remove their related info
        words_info.pop(lower_path, None)
        # handle delete tags
        old_tags = tags_info_paths.get(lower_path)
        if not isinstance(old_tags, (list, tuple)):
            old_tags = []
        old_tags = [smart_unicode(tag) for tag in old_tags]
        tags_info_paths.pop(lower_path, None)
        for tag in old_tags:
            tags_info_tags_for_tag_paths = tags_info_tags.setdefault(tag, [])
            if lower_path in tags_info_tags_for_tag_paths:
                tags_info_tags_for_tag_paths.remove(lower_path)
            if not tags_info_tags_for_tag_paths:
                # the tag has no paths left
                tags_info_tags.pop(tag, None)
        # handle delete links
        old_links = links_info_paths.get(lower_path)
        if not isinstance(old_links, (list, tuple)):
            old_links = []
        old_links = [smart_unicode(link) for link in old_links]
        links_info_paths.pop(lower_path, None)
        for link in old_links:
            links_info_link_back_paths = links_info_links.setdefault(link, [])
            if lower_path in links_info_link_back_paths:
                links_info_link_back_paths.remove(lower_path)
            if not links_info_link_back_paths:
                # the link has no back paths left
                links_info_links.pop(link, None)
    else:
        bucket_text_words += text_words
        words_info[lower_path] = text_words
        # handle tags
        if post_tags:
            tags_info_paths[lower_path] = post_tags
        for tag in post_tags:
            tags_info_tags_for_tag_paths = tags_info_tags.setdefault(tag, [])
            if lower_path not in tags_info_tags_for_tag_paths:
                tags_info_tags_for_tag_paths.append(lower_path)
        empty_tags = []
        for tag, paths_tagged in tags_info_tags.items():
            if not paths_tagged:
                empty_tags.append(tag)
                continue
            if not isinstance(paths_tagged, list):
                continue
            if lower_path in paths_tagged and tag not in post_tags:
                paths_tagged.remove(lower_path)
                if not paths_tagged:
                    empty_tags.append(tag)
        for empty_tag in empty_tags:
            tags_info_tags.pop(empty_tag, None)
        # handle links
        if post_doc_links:
            links_info_paths[lower_path] = post_doc_links
        for link in post_doc_links:
            links_info_link_back_paths = links_info_links.setdefault(link, [])
            if lower_path not in links_info_link_back_paths:
                links_info_link_back_paths.append(lower_path)
        empty_links = []
        for link, paths_linked in links_info_links.items():
            if not paths_linked:
                empty_links.append(link)
                continue
            if not isinstance(paths_linked, list):
                continue
            if lower_path in paths_linked and link not in post_doc_links:
                paths_linked.remove(lower_path)
                if not paths_linked:
                    empty_links.append(link)
        for empty_link in empty_links:
            links_info_links.pop(empty_link, None)
    if bucket_text_words < 0:
        bucket_text_words = 0
    posts_info['text_words'] = bucket_text_words
    set_bucket_configs(bucket, configs=posts_info, config_type='posts')
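
# Illustrative shape of the posts_info config that update_post_tags_words_info above
# maintains via set_bucket_configs(..., config_type='posts'). The key layout follows the
# setdefault calls in the function; the paths, tags and numbers are made up.
_EXAMPLE_POSTS_INFO = {
    'text_words': 1234,                                   # total words over non-draft posts
    'words': {'notes/foo.md': 800, 'notes/bar.md': 434},  # per-path word counts
    'tags': {
        'paths': {'notes/foo.md': ['python', 'sync']},    # path -> its tags
        'tags': {'python': ['notes/foo.md']},             # tag -> paths carrying it
    },
    'links': {
        'paths': {'notes/foo.md': ['notes/bar.md']},      # path -> docs it links to
        'links': {'notes/bar.md': ['notes/foo.md']},      # linked doc -> back paths
    },
}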