Example #1
def get_post_with_greed(url_body, parent_doc=None):
    pure_url_body = re.split("[?#]", url_body)[0]
    post_url = pure_url_body
    d = get_data_namespace()
    post_doc = d.get_doc(post_url)
    current_data_root = get_current_data_root()
    parent_doc = parent_doc or get_doc_in_request()
    if not post_doc and is_a_markdown_file(post_url) and parent_doc and isinstance(parent_doc, dict):
        filename = post_url
        if "/post/" in filename:
            filename = filename.split("/post/", 1)[-1]
        parent_post_doc_path = get_path_from_record(parent_doc)
        if parent_post_doc_path:
            post_doc_parent = os.path.split(parent_post_doc_path)[0]
            if post_doc_parent:
                abs_path = "%s/%s" % (post_doc_parent.strip("/"), filename.strip("/"))
                post_doc = d.get_doc_by_path(abs_path)

        if current_data_root and not post_doc:  # add the wiki root as a prefix, then try to match again
            abs_path = "%s/%s" % (current_data_root, filename.strip("/"))
            post_doc = d.get_doc_by_path(abs_path)

    if not post_doc:  # try keyword-based search matching
        bucket = get_bucket_in_request_context()
        post_name = (get_get_var(url_body, "name") or "").strip()
        if post_name:
            if "." in post_name:
                post_name = os.path.splitext(post_name)[0]
            post_doc = get_one_post_by_es(bucket, keywords=post_name, under=current_data_root)
        if not post_doc and is_a_markdown_file(post_url):  # search the filename directly
            just_post_file_name = get_just_name(post_url)
            if just_post_file_name != post_name:
                post_doc = get_one_post_by_es(bucket, keywords=just_post_file_name, under=current_data_root)
    return post_doc
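
A minimal, runnable sketch of the URL normalization get_post_with_greed applies before any lookup; the doc/bucket helpers belong to the surrounding codebase and are omitted here:

import re

def demo_normalize(url_body):
    # strip the query string / fragment, keep the pure path (same as above)
    pure_url_body = re.split("[?#]", url_body)[0]
    # strip the "/post/" prefix the way the fallback branch does
    if "/post/" in pure_url_body:
        filename = pure_url_body.split("/post/", 1)[-1]
    else:
        filename = pure_url_body
    return pure_url_body, filename

# demo_normalize("/post/notes/today.md?name=today#top")
# -> ('/post/notes/today.md', 'notes/today.md')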
Example #2
def load_all_posts_visits_from_csv(bucket, csv_file_record):
    visits_db_name = get_visits_db_name_for_bucket(bucket)
    current_visits_size = hsize(visits_db_name)
    if current_visits_size > 5000:  # if there are more than 5k entries, clear first to avoid excessive bloat
        hclear(visits_db_name)
    raw_objects = csv_file_record.get('objects') or []
    if not raw_objects:
        return
    if not isinstance(raw_objects, (list, tuple)):
        return
    for data_obj in raw_objects[:3000]:
        # process at most 3k records, so a single bucket does not grow too large and cause performance problems
        if not isinstance(data_obj, dict):
            continue
        path = data_obj.get('path')
        if not path or not isinstance(path, string_types):
            continue
        path = path.strip('/').lower()
        if not is_a_markdown_file(path):
            continue
        visits = to_int(data_obj.get('visits'), default_if_fail=0)
        visitors = to_int(data_obj.get('visitors'), default_if_fail=0)
        visits_key = get_visits_key(path, field='visits')
        visitors_key = get_visits_key(path, field='visitors')
        hset(visits_db_name, visits_key, visits)
        hset(visits_db_name, visitors_key, visitors)
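
A hedged sketch of the csv_file_record shape this function expects; the keys come straight from the code above, the values are made up:

csv_file_record = {
    'objects': [
        {'path': '/notes/today.md', 'visits': '120', 'visitors': '80'},
        {'path': '/images/a.png', 'visits': '5', 'visitors': '5'},  # skipped: not markdown
    ]
}
# load_all_posts_visits_from_csv(bucket, csv_file_record) stores the counts
# under hash keys produced by get_visits_key(path, field='visits'/'visitors')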
Example #3
def get_bucket_markdown_record_ids(bucket, under='', max_limit=20000):
    record_ids = []
    paths_and_ids = get_paths_and_ids_under(bucket=bucket,
                                            under=under,
                                            max_limit=max_limit)
    for (path, record_id) in paths_and_ids:
        if is_a_markdown_file(path):
            record_ids.append(record_id)
    return record_ids
Example #4
    def should_md_doc_hit_folder_compiler(self):
        if not self.filepath:
            return False
        if not os.path.isfile(self.filepath):
            return False
        if not is_a_markdown_file(self.filepath):
            return False
        just_name = get_just_name(self.filepath)
        return just_name == 'index'
Example #5
def get_template_info(template_dir):
    info = {}
    template_dir = template_dir.strip().rstrip('/')
    if not os.path.isdir(template_dir):
        return info  # ignore
    filepaths = get_all_sub_files(template_dir,
                                  accept_func=os.path.isfile,
                                  max_tried_times=1000)
    for filepath in filepaths:
        relative_path = get_relative_path(
            filepath, root=template_dir).lower()  # lower case
        if not os.path.isfile(filepath):
            continue
        if not is_real(filepath) or is_a_hidden_path(relative_path):
            continue
        if relative_path.startswith('readme.') and is_a_markdown_file(
                relative_path):  # info declared in the template's readme
            with open(filepath, 'rb') as f:
                raw_markdown_content = smart_unicode(f.read())
            compiled_markdown_content = compile_markdown(raw_markdown_content)
            compiled_markdown_content_meta = compiled_markdown_content.metadata
            readme_info = dict(content=compiled_markdown_content,
                               metadata=compiled_markdown_content_meta
                               )  # raw_content=raw_markdown_content,
            info['_readme'] = readme_info
        else:
            path_without_ext, ext = os.path.splitext(relative_path)
            ext = ext.strip('.').lower()
            if ext not in allowed_exts:
                continue
            with open(filepath, 'rb') as f:
                raw_content = f.read()
            raw_content = smart_unicode(raw_content)  # to unicode
            info[relative_path] = raw_content
            matched_compiler = template_resource_compilers.get(ext)
            if matched_compiler:
                new_ext, compile_func = matched_compiler
                try:
                    compiled_content = compile_func(raw_content)
                    new_key = path_without_ext + '.' + new_ext.strip('.')
                    info[new_key] = compiled_content
                except Exception as e:
                    error_message = getattr(e, 'message', None)
                    if error_message:
                        try:
                            print('%s error: %s' %
                                  (relative_path, error_message))
                        except:
                            pass
    info['_route'] = get_templates_route_info(info)
    return info
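
template_resource_compilers maps a source extension to a (new_ext, compile_func) tuple; a minimal sketch of the registry shape, using an identity function as a stand-in for a real compiler:

# hypothetical entry: with this, 'style.scss' is stored under both
# 'style.scss' (raw) and 'style.css' (compiled), matching the new_key logic above
template_resource_compilers = {
    'scss': ('css', lambda raw_content: raw_content),
}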
Example #6
def append_to_markdown_record(bucket,
                              relative_path,
                              content_to_append,
                              lines_to_append=1,
                              more_line_when_seconds_passed=0,
                              position='tail'):
    record = get_record_by_path(bucket, path=relative_path) or {}
    record_type = record.get('_type') or record.get('type')
    if record_type != 'post':
        return 'ignore'  # ignore
    if not is_a_markdown_file(relative_path):
        return 'ignore'
    old_content = record.get('raw_content') or record.get('content') or ''
    old_content = smart_unicode(old_content)
    now = time.time()
    old_timestamp = record.get('timestamp')

    if more_line_when_seconds_passed and old_timestamp and isinstance(
            old_timestamp, (int, float)):
        # once this many seconds have passed, automatically leave an extra blank line, effectively starting a new "paragraph"
        diff = now - old_timestamp
        if diff > more_line_when_seconds_passed:
            lines_to_append += 1

    interval_empty_lines = '\r\n' * abs(to_int(lines_to_append,
                                               max_value=10))  # blank-line separator

    content_to_append = smart_unicode(content_to_append).strip()

    if old_content.endswith(
            '\n' + content_to_append) or old_content == content_to_append:
        return 'ignore'  # ignore, duplicated content is not processed

    if position == 'tail':  # append to the tail by default
        new_content = '%s%s' % (interval_empty_lines, content_to_append)
        content = '%s%s' % (old_content, new_content)
    else:
        new_content = '%s%s' % (content_to_append, interval_empty_lines)
        content = '%s%s' % (new_content, old_content)
    error_info = sync_file_by_server_side(bucket=bucket,
                                          relative_path=relative_path,
                                          content=content)
    return error_info
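
A hedged usage sketch, assuming bucket is an existing bucket and the record at the path is a post:

error_info = append_to_markdown_record(
    bucket, 'notes/today.md', '- reviewed the sync code',
    lines_to_append=1,
    more_line_when_seconds_passed=600,  # extra blank line if >10 min since last write
)
# returns 'ignore' for non-posts, non-markdown paths, and duplicated tails;
# otherwise returns whatever sync_file_by_server_side reports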
Example #7
    def json_data_for_sync(self):
        if not self.relative_path:
            return  # ignore
        if self.filepath and not os.path.exists(
                self.filepath) and not self.is_deleted:
            return  # ignore too

        # these are all parameters accepted by BasicSyncCompiler
        kwargs = dict(
            relative_path=self.relative_path,
            real_relative_path=self.real_relative_path,
            abs_filepath=self.filepath,
            private_key=self.private_key,
            should_encrypt_file=self.should_encrypt_file,
            is_dir=self.is_dir,
            is_deleted=self.is_deleted,
            ipfs_key=self.ipfs_key,
            version=self.version,
            raw_content=self._raw_content,
            files_info=self.files_info,
        )

        matched_compiler = None
        is_markdown = is_a_markdown_file(self.relative_path)
        if self._raw_content:
            is_file = True
        elif self.filepath:
            is_file = os.path.isfile(self.filepath)
        elif self.is_dir:
            is_file = False
        else:
            is_file = True
        if self.is_dir:
            matched_compiler = FolderSyncCompiler(**kwargs)
        elif is_markdown and self.relative_path not in FILE_TYPE_FILENAMES:
            matched_compiler = PostSyncCompiler(**kwargs)
        elif is_file and self.relative_path in VISITS_FILEPATHS:
            matched_compiler = VisitsSyncCompiler(**kwargs)
        elif is_file and self.lower_relative_path.startswith('_comments/'):
            matched_compiler = CommentsSyncCompiler(**kwargs)

        if matched_compiler:
            # accessing compiled_data runs the compile logic; if
            # compiler.should_ignore_current_file, return {}, which means no sync
            matched_data = matched_compiler.compiled_data
            doc_type = matched_data.get('_type')
            if matched_compiler.should_ignore_current_file:
                return {}
            if matched_data:
                matched_json_data = self.json_dumps(matched_data)
                if len(matched_json_data) < MAX_RECORD_SIZE:
                    return matched_json_data
                else:
                    # size exceeded: fall through to the generic file-record logic below
                    if doc_type in ['visits', 'comments']:
                        # these types are not treated as plain files; past 300 kb
                        # they are simply invalid (after data conversion the real
                        # size is around 100 kb anyway)
                        return {}

        # no compiler matched above, or the record was too large: handle as a plain record
        common_file_compiler = FileSyncCompiler(**kwargs)
        compiled_data = common_file_compiler.compiled_data
        if compiled_data:
            compiled_json_data = self.json_dumps(compiled_data)
            if len(compiled_json_data) < MAX_RECORD_SIZE:
                return compiled_json_data

        data = common_file_compiler.basic_compiled_data
        compiled_json_data = self.json_dumps(data)
        return compiled_json_data
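
For reference, the compiler dispatch above condensed into one summary; this is a restatement of the branches, not new behavior:

# 1. directory                                  -> FolderSyncCompiler
# 2. markdown file not in FILE_TYPE_FILENAMES   -> PostSyncCompiler
# 3. file listed in VISITS_FILEPATHS            -> VisitsSyncCompiler
# 4. file under '_comments/'                    -> CommentsSyncCompiler
# fallback (no match, or record >= MAX_RECORD_SIZE) -> FileSyncCompiler,
# then basic_compiled_data as the last resort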
Example #8
def get_files_info(bucket):
    data = {}
    path_bucket = get_bucket_name_for_path(bucket)
    data['files'] = {}
    data['folders'] = {}
    data['lower_files'] = {}
    data['lower_folders'] = []  # not a dict
    lower_folders = []
    lower_folders_count = {}
    records = hscan(path_bucket, key_start='', limit=20000)
    for filepath, filepath_data_string in records:
        if filepath.startswith('_'):
            continue
        lower_filepath = filepath.strip().lower()
        # prepare raw data: 'record_id,size,version'
        raw_filepath_data = filepath_data_string.split(',')
        if len(raw_filepath_data) != 3:
            continue
        filepath_data_keys = ['record_id', 'size', 'version']
        filepath_data = dict(zip(filepath_data_keys, raw_filepath_data))
        filepath_data['size'] = to_int(filepath_data['size'], default_if_fail=0)
        if filepath_data.get('version') == 'folder':
            is_image = False
            is_markdown = False
            data['folders'][filepath] = filepath_data
            if lower_filepath not in lower_folders:
                lower_folders.append(lower_filepath)
        else:
            is_image = is_a_image_file(filepath)
            is_markdown = is_a_markdown_file(filepath)
            data['files'][filepath] = filepath_data
            data['lower_files'][lower_filepath] = filepath_data
        lower_folder_path = os.path.split(filepath.strip().lower())[0]
        if lower_folder_path:
            parts = lower_folder_path.split('/')
            parts_length = len(parts)
            if parts_length > 10:
                continue
            for i in range(parts_length):
                one_lower_folder_path = '/'.join(parts[:i + 1])
                last_path_part = one_lower_folder_path.split('/')[-1]
                if last_path_part.startswith('_'):
                    continue
                if one_lower_folder_path not in lower_folders:
                    lower_folders.append(one_lower_folder_path)
                if one_lower_folder_path:
                    images_count_plus = 1 if is_image else 0
                    posts_count_plus = 1 if is_markdown else 0
                    _images_count_plus = 1 if images_count_plus and lower_folder_path == one_lower_folder_path else 0
                    _posts_count_plus = 1 if posts_count_plus and lower_folder_path == one_lower_folder_path else 0
                    matched_count = lower_folders_count.setdefault(one_lower_folder_path, {})
                    matched_count['images_count'] = matched_count.get('images_count', 0) + images_count_plus
                    matched_count['posts_count'] = matched_count.get('posts_count', 0) + posts_count_plus
                    matched_count['_images_count'] = matched_count.get('_images_count', 0) + _images_count_plus
                    matched_count['_posts_count'] = matched_count.get('_posts_count', 0) + _posts_count_plus
    data['lower_folders'] = lower_folders
    data['lower_folders_count'] = lower_folders_count

    data['date'] = time.time()
    return data
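
Each value in the path bucket is a comma-separated 'record_id,size,version' string; a minimal runnable sketch of the parsing step above, with made-up values:

filepath_data_string = 'rec_123,2048,v7'
filepath_data = dict(zip(['record_id', 'size', 'version'],
                         filepath_data_string.split(',')))
# -> {'record_id': 'rec_123', 'size': '2048', 'version': 'v7'}
# a folder is marked by version == 'folder'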
Example #9
def get_linked_docs_from_markdown_content(path,
                                          raw_content,
                                          md_link_abs_check_func=None):
    # returns (link_paths, tags), both lists of unicode strings
    if not raw_content:
        return [], []
    if not isinstance(raw_content, string_types):
        return [], []
    raw_content = smart_unicode(raw_content)

    # [xxx](/???.md)
    maybe_md_links = []
    for m in re.finditer("(?:(?<=^)|(?<!!))(\\[.*?\\])\\((.*?)\\)",
                         raw_content):
        link = m.group(2)
        if "://" in link:
            continue
        if "?" in link:
            link = link.split("?")[0]
        if "#" in link and not link.startswith("#"):
            link = link.split("#", 1)[0]
        link = link.strip()
        if is_a_markdown_file(link):  # here, must be a markdown file
            if link not in maybe_md_links:
                maybe_md_links.append(link)

    for m in re.finditer("(?<!\[)(\[\[)([^\[\]]+)(\]\])", raw_content):
        # [[ xxx ]]
        # [[ xxx | title ]]
        # [[ xxx | title # id ]]
        link = m.group(2)
        link, link_title, link_id = get_link_title_id_in_wiki_syntax(link)
        if "?" in link:
            link = link.split("?")[0]
        if "#" in link and not link.startswith("#"):
            link = link.split("#", 1)[0]
        if not link:
            continue
        link = link.strip()
        if link not in maybe_md_links:
            maybe_md_links.append(link)

    # validate the collected links
    tags = []
    post_parent_path = path.strip("/").rsplit("/", 1)[0]
    link_paths = []
    for maybe_md_link in maybe_md_links:
        if maybe_md_link.startswith("#"):
            tag = maybe_md_link.lstrip("#")
            if tag not in tags:
                tags.append(tag)
            continue
        if not is_a_markdown_file(
                maybe_md_link):  # by default add .md ext to the link
            maybe_md_link += ".md"
        if maybe_md_link.startswith("/"):  # 相对于根目录下已经是完整的地址了
            link = maybe_md_link
        else:
            if md_link_abs_check_func and md_link_abs_check_func(
                    maybe_md_link):
                # per the check func, the leading / was merely omitted; restore it here
                link = "/%s" % maybe_md_link.strip("/")
            else:
                link = "%s/%s" % (post_parent_path, maybe_md_link.strip("/"))
        if not link:
            continue

        # normalize to lower case
        link = link.lower().lstrip("/")

        if link not in link_paths:
            link_paths.append(link)

    return link_paths, tags
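
A runnable sketch of just the two link-extraction regexes above on sample content; the path resolution and the wiki-syntax helpers are omitted:

import re

sample = u"![img](a.png) see [intro](docs/intro.md) and [[Wiki Page | Title]]"
md_links = [m.group(2) for m in
            re.finditer("(?:(?<=^)|(?<!!))(\\[.*?\\])\\((.*?)\\)", sample)]
wiki_links = [m.group(2).strip() for m in
              re.finditer(r"(?<!\[)(\[\[)([^\[\]]+)(\]\])", sample)]
# md_links   -> ['docs/intro.md']   (the image link is excluded by the (?<!!) lookbehind)
# wiki_links -> ['Wiki Page | Title']  (split further by get_link_title_id_in_wiki_syntax)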
Example #10
def append_to_markdown_doc_and_sync(bucket,
                                    path,
                                    content,
                                    lines_to_append=1,
                                    reverse=False,
                                    do_not_repeat=True,
                                    lines_more_diff=None,
                                    draft_by_default=False):
    # by default, check whether the appended content duplicates what is already there
    if not bucket or not path or not content:
        return
    if not isinstance(bucket, string_types) or not isinstance(
            path, string_types) or not isinstance(content, string_types):
        return
    if not has_bucket(bucket):
        return
    if not is_a_markdown_file(path):
        return

    content = smart_unicode(content)

    old_doc = get_record_by_path(bucket, path=path) or {}
    if not isinstance(old_doc, dict):
        old_doc = {}

    if lines_more_diff:  # after enough time has passed, automatically add one extra blank line
        if old_doc and old_doc.get('timestamp'):
            try:
                diff = time.time() - old_doc.get('timestamp')
                if diff > lines_more_diff:
                    lines_to_append += 1
            except:
                pass

    interval = '\r\n' * abs(to_int(lines_to_append, max_value=10))  # blank-line separator

    if old_doc:
        if get_type_from_record(old_doc) == 'post':  # only post-type docs support append for now
            old_content = old_doc.get('raw_content') or ''
            if old_content == " ":
                old_content = ""
            if do_not_repeat:
                if reverse:
                    if old_content.strip().startswith(content.strip()):
                        return ""
                else:
                    old_content_s = old_content.strip()
                    appended_content_s = content.strip()
                    if old_content_s.endswith(
                            '\n' + appended_content_s
                    ) or old_content_s == appended_content_s:
                        return ''  # ignore, duplicated content is not processed
            if reverse:  # insert at the head
                new_content = '%s%s' % (content, interval)
                content = '%s%s' % (new_content, old_content)
            else:
                new_content = '%s%s' % (interval, content)
                content = '%s%s' % (old_content, new_content)
        else:
            return
    else:  # new doc
        content = content.strip()
        if draft_by_default:
            # new docs default to draft status
            if re.match(u"\\w+[:\uff1a]", content):  # the user may have declared their own metadata
                content = "status: draft\n%s" % content
            else:
                now = get_now_from_bucket(bucket)
                content = "date: %s\nstatus: draft\n\n%s" % (now, content)

    sync_file_by_server_side(bucket=bucket,
                             relative_path=path,
                             content=content)

    return True  # done
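
A hedged usage sketch, assuming the bucket exists (has_bucket must pass) and the path is markdown:

done = append_to_markdown_doc_and_sync(
    bucket, 'journal/2024.md', 'a new line of journal text',
    lines_to_append=1,
    lines_more_diff=3600,    # extra blank line if >1h since the last write
    draft_by_default=True,   # brand-new docs get a "status: draft" header
)
# returns True once synced, '' for duplicated content, None when ignored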
Example #11
def update_post_tags_words_info(bucket, record_data):
    path = get_path_from_record(record_data)
    if not path:
        return
    lower_path = path.lower().lstrip('/')
    if not is_a_markdown_file(path):
        return
    if lower_path.startswith('_nav/'):
        return
    posts_info = get_bucket_posts_info(bucket) or {}
    # data init
    bucket_text_words = to_int(posts_info.get('text_words') or 0,
                               default_if_fail=0)

    # prepare tags info
    tags_info = posts_info.setdefault(
        'tags',
        {})  # {'paths':{path:[tag1,tag2]} ,  'tags': {'tag':[path1, path2]} }
    tags_info_tags = tags_info.setdefault('tags', {})
    tags_info_paths = tags_info.setdefault('paths', {})

    # prepare links info
    links_info = posts_info.setdefault(
        "links", {})  # {'paths': {path: [link1, link2]},
    #   'links': {link: [back_path1, back_path2]} }
    links_info_links = links_info.setdefault("links", {})
    links_info_paths = links_info.setdefault("paths", {})

    words_info = posts_info.setdefault('words', {})  # {'path': text_words}

    is_deleted = record_data.get('is_deleted', False)
    post_status = record_data.get('status') or 'public'
    post_tags = record_data.get('tags') or []
    if not isinstance(post_tags, (list, tuple)):
        post_tags = []

    post_doc_links, wiki_tags = get_linked_docs_from_markdown_content(
        path,
        record_data.get("raw_content"),
        md_link_abs_check_func=partial(has_record_by_path, bucket))
    if not isinstance(post_doc_links, (list, tuple)):
        post_doc_links = []

    text_words = to_int(record_data.get('text_words'), default_if_fail=0)

    # if a words entry already exists, subtract it first to avoid double counting
    old_text_words = to_int(words_info.get(lower_path), default_if_fail=0)
    if old_text_words:
        bucket_text_words -= old_text_words

    # deleted or draft-like statuses are excluded from the stats; remove the related info
    if is_deleted or post_status in ['draft', 'private']:
        words_info.pop(lower_path, None)

        # handle delete tags
        old_tags = tags_info_paths.get(lower_path)
        if not isinstance(old_tags, (list, tuple)):
            old_tags = []
        old_tags = [smart_unicode(tag) for tag in old_tags]
        tags_info_paths.pop(lower_path, None)
        for tag in old_tags:
            tags_info_tags_for_tag_paths = tags_info_tags.setdefault(tag, [])
            if lower_path in tags_info_tags_for_tag_paths:
                tags_info_tags_for_tag_paths.remove(lower_path)
                if not tags_info_tags_for_tag_paths:  # tag has no paths left
                    tags_info_tags.pop(tag, None)

        # handle delete links
        old_links = links_info_paths.get(lower_path)
        if not isinstance(old_links, (list, tuple)):
            old_links = []
        old_links = [smart_unicode(link) for link in old_links]
        links_info_paths.pop(lower_path, None)
        for link in old_links:
            links_info_link_back_paths = links_info_links.setdefault(link, [])
            if lower_path in links_info_link_back_paths:
                links_info_link_back_paths.remove(lower_path)
                if not links_info_link_back_paths:  # link has no back paths left
                    links_info_links.pop(link, None)

    else:
        bucket_text_words += text_words
        words_info[lower_path] = text_words

        # handle tags
        if post_tags:
            tags_info_paths[lower_path] = post_tags
        for tag in post_tags:
            tags_info_tags_for_tag_paths = tags_info_tags.setdefault(tag, [])
            if lower_path not in tags_info_tags_for_tag_paths:
                tags_info_tags_for_tag_paths.append(lower_path)
        empty_tags = []
        for tag, paths_tagged in tags_info_tags.items():
            if not paths_tagged:
                empty_tags.append(tag)
                continue
            if not isinstance(paths_tagged, list):
                continue
            if lower_path in paths_tagged and tag not in post_tags:
                paths_tagged.remove(lower_path)
            if not paths_tagged:
                empty_tags.append(tag)
        for empty_tag in empty_tags:
            tags_info_tags.pop(empty_tag, None)

        # handle links
        if post_doc_links:
            links_info_paths[lower_path] = post_doc_links
        for link in post_doc_links:
            links_info_link_back_paths = links_info_links.setdefault(link, [])
            if lower_path not in links_info_link_back_paths:
                links_info_link_back_paths.append(lower_path)
        empty_links = []
        for link, paths_linked in links_info_links.items():
            if not paths_linked:
                empty_links.append(link)
                continue
            if not isinstance(paths_linked, list):
                continue
            if lower_path in paths_linked and link not in post_doc_links:
                paths_linked.remove(lower_path)
            if not paths_linked:
                empty_links.append(link)
        for empty_link in empty_links:
            links_info_links.pop(empty_link, None)

    if bucket_text_words < 0:
        bucket_text_words = 0

    posts_info['text_words'] = bucket_text_words

    set_bucket_configs(bucket, configs=posts_info, config_type='posts')
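
A hedged sketch of the posts_info config shape this function maintains, assembled from the setdefault calls and comments above (paths and counts are made up):

posts_info = {
    'text_words': 12345,                  # bucket-wide word count
    'words': {'notes/a.md': 300},         # per-path text_words
    'tags': {'paths': {'notes/a.md': ['tag1']},
             'tags': {'tag1': ['notes/a.md']}},
    'links': {'paths': {'notes/a.md': ['notes/b.md']},
              'links': {'notes/b.md': ['notes/a.md']}},
}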