def get_url(entry: Entry):
    path = entry.get_path()
    major = Entry.normalize(path[1])  # path[0] is the wiki root, so cut it off
    assert major in wiki_pages
    minor = path[-1]
    return create_wiki_link(major, minor)
def switch_context_by_level(self, title):
    # go up until "title" is a child of working_node (concerning levels)
    while True:
        level_old = Entry.level_of_title(self.working_node.print_name)
        level_new = Entry.level_of_title(title)
        if level_old >= level_new:
            self.working_node = self.working_node.parent
        else:
            break
def extract_and_replace_links(text, wiki):
    links = []
    # ms = re.finditer(r"\[(.+?)\]\((.+?)\)", text)
    # for m in ms:
    #     title = m.group(1)
    #     url = m.group(2)
    #     full = m.group(0)
    #     if url.strip().startswith("#"):
    #         links.append((title, url, full))
    # re.finditer (not re.findall) is required here: findall returns plain
    # tuples, but the loop below calls m.group(...) on match objects
    ms = re.finditer(r"\[\[(.+?)(\|(.+?))?\]\]", text)
    for m in ms:
        full = m.group(0)
        url = m.group(1)
        title = m.group(3) or url
        links.append((title, url, full))
    output = []
    for title, url, full in links:
        alias = Entry.normalize(url)
        entry = wiki.find_child(alias)
        if entry is None:  # couldn't resolve
            continue
        link = Link.from_entry(entry)
        output.append(link)
        text = text.replace(full, repr(link))
    return text, output
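# Minimal, self-contained sketch of the wikilink regex above; the Entry/Link
# resolution is project-specific and omitted. The sample text is hypothetical.
# group(1) carries the link target and group(3) the optional "|alias" part.
import re

sample = "See [[Dragon Slayer|the sword]] and [[Quests]]."
for m in re.finditer(r"\[\[(.+?)(\|(.+?))?\]\]", sample):
    full, url, title = m.group(0), m.group(1), m.group(3) or m.group(1)
    print(full, "->", url, "/", title)
# [[Dragon Slayer|the sword]] -> Dragon Slayer / the sword
# [[Quests]] -> Quests / Quests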
def parse_comments():
    repo.git.checkout("comments")
    files_comm = glob.glob(f"./{reddit_crawl.submodule_repo}/{thread_id}/*.md")
    files_wiki = glob.glob(f"./{reddit_crawl.submodule_repo}/wiki/*.md")
    wiki = Entry.create_wiki_root()
    wiki_parser.parse_entries_and_insert_with_overwrite(wiki, files_wiki, "wiki")
    wiki_parser.parse_entries_and_insert_with_overwrite(wiki, files_comm, "comment")
    wiki.sort_children_recursive()
    wiki_pages = wiki_parser.split_into_files(wiki)
    for page, node in wiki_pages.items():
        with open(wiki_reader.file_wiki_page(page), "w") as file:
            content = node.to_string(short=False) if node is not None else ""
            file.write(content)
    repo.git.add("./wiki")
    repo.git.commit("--allow-empty", m=f"Parsed-{commit_msg}")
    print(repo.git.status())
    return wiki
def parse_tree(self):
    # first pass: collect every entry into a flat list
    flat = []
    while True:
        e = self.parse_entry()
        if e is None:  # no entry is coming
            break
        flat.append(e)
    # second pass: nest the flat list by heading level, keeping the
    # current branch (root .. deepest open node) on a stack
    root = Entry("root")  # level 0 root
    branch = [root]
    while len(flat) > 0:
        node = flat.pop(0)
        level_new = node.level()
        while len(branch) > 1:  # cut back the branch if necessary
            level_old = branch[-1].level()
            if level_new <= level_old:  # cut
                branch = branch[:-1]
            else:  # no cut
                break
        parent = branch[-1]
        parent.add_child(node)
        branch.append(node)
    return root
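# Self-contained sketch of the branch-stack nesting used in parse_tree, with
# (level, name) tuples standing in for Entry objects. Hypothetical data; the
# real code reads levels from Entry.level().
def nest(flat):
    root = {"name": "root", "level": 0, "children": []}
    branch = [root]
    for level, name in flat:
        while len(branch) > 1 and level <= branch[-1]["level"]:
            branch.pop()  # cut back to the first strictly shallower ancestor
        node = {"name": name, "level": level, "children": []}
        branch[-1]["children"].append(node)
        branch.append(node)
    return root

tree = nest([(1, "Weapons"), (2, "Swords"), (2, "Bows"), (1, "Armor")])
# root -> Weapons -> [Swords, Bows]; Armor becomes a sibling of Weapons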
def parse_wiki():
    wiki = Entry.create_wiki_root()
    for p in wiki_pages:
        content = fetch_wiki_page(p)
        WikiParser.take_and_parse(content.splitlines(), None, wiki)
    wiki.sort_children_recursive()
    return wiki
def split_into_files(wiki: Entry):
    wiki_pages = RedditWikiReader.wiki_pages
    topcats = {}
    wiki = wiki.copy()
    for page in wiki_pages:
        topcats[page] = wiki.find_child(page)
        if topcats[page] is not None:
            wiki.children.remove(topcats[page])
    # whatever remains on the copied root goes to the "misc" page
    if len(wiki.children) > 0:
        topcats['misc'] = wiki
    else:
        topcats['misc'] = None
    return topcats
def end_entry(self):
    if self.entry is None or self.entry['print_name'] is None:
        # shouldn't happen thanks to prev. null-checks
        raise RuntimeError("Don't have an entry to end!", self)
    content = "\n".join(self.entry['content'])
    content = Entry.format_content(content)
    info = self.entry['info']
    print_name = self.entry['print_name']
    child = Entry(print_name=print_name, aliases=None, info=info, content=content)
    self.working_node.merge_with(child, replace=True)
    self.mode = "entry-title"
    self.entry = None
def start_entry(self, title: str):
    node = self.tree.find_child(title)
    if node is None:
        node = Entry(print_name=title, info=None, content=None, aliases=None)
        self.working_node.add_child(node)
        self.working_node = node
    else:
        if Entry.level_of_title(title) != node.level():  # the two don't fit!
            raise RuntimeError(
                "Existing Node and New Node collide with different levels!", self)
        # switch to existing node
        self.working_node = node
    self.mode = "entry-info"
    self.entry = self.new_entry()
    self.entry['print_name'] = title
    self.entry['node'] = node
def attention_filter(lines):
    filtered = []
    active = False
    for line in lines:
        cat = WikiParser.line_attention_start(line)
        end = WikiParser.line_attention_end(line)
        if cat is not None:
            cat = Entry.normalize(cat)
            active = True
            filtered.append("# " + cat)  # top-level context switch
        elif end is True:
            assert active is True
            active = False
        elif active:
            filtered.append(line)
    return filtered
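# Self-contained sketch of the attention filter, with hypothetical
# "<!-- wiki:Category -->" / "<!-- /wiki -->" markers standing in for whatever
# line_attention_start / line_attention_end actually match, and .lower()
# standing in for Entry.normalize. Lines outside a marked region are dropped.
import re

def attention_filter_sketch(lines):
    filtered, active = [], False
    for line in lines:
        start = re.match(r"<!-- wiki:(.+?) -->", line)
        if start:
            active = True
            filtered.append("# " + start.group(1).lower())  # top-level context switch
        elif line.strip() == "<!-- /wiki -->":
            active = False
        elif active:
            filtered.append(line)
    return filtered

print(attention_filter_sketch(
    ["chatter", "<!-- wiki:Weapons -->", "## Swords", "<!-- /wiki -->", "more chatter"]))
# ['# weapons', '## Swords']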
def parse_entry(self):
    old_ptr = self.pointer
    self.skip_empty_lines()
    if self.ended():
        self.pointer = old_ptr
        return None  # can't parse, roll back
    name = self.line_title(self.ptr())
    if name is None:
        self.pointer = old_ptr
        return None  # can't parse, roll back
    self.inc()
    self.skip_empty_lines()
    if not self.ended():
        info = self.line_entry_info(self.ptr())
        if info is not None:
            self.inc()
        else:
            info = self.default_info(name)
    else:
        info = self.default_info(name)
    content = ""
    while not self.ended() and self.line_content(self.ptr()):
        content += self.ptr() + "\n"
        self.inc()
    entry = Entry(print_name=name, aliases=None, parent=None,
                  children=None, info=info, content=content)
    return entry
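# Self-contained sketch of the save/rollback pattern parse_entry relies on:
# remember the cursor, attempt a parse, and restore the cursor on failure so
# the caller can try something else. The Cursor class and its heading syntax
# are hypothetical stand-ins for the real parser state.
class Cursor:
    def __init__(self, lines):
        self.lines = lines
        self.pointer = 0

    def parse_title(self):
        saved = self.pointer
        if self.pointer < len(self.lines) and self.lines[self.pointer].startswith("#"):
            title = self.lines[self.pointer].lstrip("#").strip()
            self.pointer += 1
            return title
        self.pointer = saved  # roll back: nothing is consumed on failure
        return None

c = Cursor(["# Swords", "A list of swords."])
print(c.parse_title())  # Swords
print(c.parse_title())  # None, and the pointer still points at the content line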
def merge_into(dst: Entry, src: Entry):
    # jump over missing layers wherever possible
    opt = dst.find_children(src.print_name)
    if len(opt) == 1 and opt[0] != dst:
        # jump downwards the branch
        WikiParser.merge_into(opt[0], src)
    elif len(opt) == 1 and opt[0] == dst:
        # we arrived
        dst.merge_with_no_recursion(src)
        for c in src.children:
            WikiParser.merge_into(dst, c)
    elif len(opt) >= 2:
        # dst was illegal: the destination is ambiguous
        raise RuntimeError("Ambiguous merge destination!", dst, src)
    else:
        # nothing found, let's make a new destination and use it
        trg = src.copy_no_children()
        dst.add_child(trg)
        WikiParser.merge_into(trg, src)
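# Self-contained sketch of merge_into's "jump over missing layers" idea, with
# dicts standing in for Entry. find() searches the whole destination subtree,
# so a source node can attach to a unique match several levels down; the
# attribute merge (merge_with_no_recursion) is omitted since dicts here carry
# only a name.
def find(node, name):
    hits = [node] if node["name"] == name else []
    for c in node["children"]:
        hits += find(c, name)
    return hits

def merge(dst, src):
    hits = find(dst, src["name"])
    if len(hits) == 1 and hits[0] is not dst:
        merge(hits[0], src)  # descend to the unique match
    elif len(hits) == 1:  # arrived at the match itself
        for c in src["children"]:
            merge(dst, c)
    elif len(hits) >= 2:
        raise RuntimeError("ambiguous merge destination")
    else:  # no match: graft a fresh node and merge into it
        trg = {"name": src["name"], "children": []}
        dst["children"].append(trg)
        merge(trg, src)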
from wiki import Entry
import wiki_parser
import reddit_crawl
import glob

# files = reddit_crawl.crawl_all(reddit_crawl.test_url, 8)
files_comm = glob.glob("./data/fhftm0/*.md")
files_wiki = glob.glob("./data/wiki/*.md")

wiki = Entry.create_wiki_root()
wiki_parser.parse_entries_and_insert_with_overwrite(wiki, files_wiki, "wiki")
wiki_parser.parse_entries_and_insert_with_overwrite(wiki, files_comm, "comment")
wiki_pages = wiki_parser.split_into_files(wiki)
def write_wiki(wiki: Entry):
    for p in wiki_pages:
        node = wiki.find_child(p)
        if node is None:
            continue
        write_wiki_page(p, repr(node))
def default_info(self, title):
    level = Entry.level_of_title(title)
    if level <= 2:  # part of the standard tree
        return None
    return self.default_info_string
def get_player(self, name: str):
    for player in self.players:
        if Entry.normalize(player.name) == Entry.normalize(name):
            return player
    return None
def __init__(self, file):
    objs = json.loads(file)
    self.players = [Player(obj) for obj in objs]
    self.players.sort(key=lambda p: Entry.normalize(p.name))
def from_entry(entry: Entry):
    title = entry.reference_name()
    url = reddit_utils.get_url(entry)
    tooltip = Link.simplify_tooltip(entry.content)
    return Link(title, url, tooltip)