# Dependencies (gensim 3.x wikicorpus helpers) shared by the segment() variants below.
import re
from xml.etree import cElementTree

from gensim.corpora.wikicorpus import filter_wiki, find_interlinks, get_namespace


def segment(page_xml, include_interlinks=False):
    """Parse the content inside a page tag.

    Parameters
    ----------
    page_xml : str
        Content from page tag.
    include_interlinks : bool
        Whether or not interlinks should be parsed.

    Returns
    -------
    (str, list of (str, str), (Optionally) dict of (str: str), str)
        Structure contains (title, [(section_heading, section_content), ...],
        (Optionally) {interlinks}, article_id).

    """
    elem = cElementTree.fromstring(page_xml)
    filter_namespaces = ('0',)
    namespace = get_namespace(elem.tag)
    ns_mapping = {"ns": namespace}
    text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping
    title_path = "./{%(ns)s}title" % ns_mapping
    id_path = "./{%(ns)s}id" % ns_mapping
    ns_path = "./{%(ns)s}ns" % ns_mapping
    lead_section_heading = "Introduction"
    top_level_heading_regex = r"\n==[^=].*[^=]==\n"
    top_level_heading_regex_capture = r"\n==([^=].*[^=])==\n"

    title = elem.find(title_path).text
    articleID = elem.find(id_path).text
    text = elem.find(text_path).text
    ns = elem.find(ns_path).text
    if ns not in filter_namespaces:
        text = None

    if text is not None:
        if include_interlinks:
            interlinks = find_interlinks(text)
        section_contents = re.split(top_level_heading_regex, text)
        section_headings = [lead_section_heading] + re.findall(top_level_heading_regex_capture, text)
        section_headings = [heading.strip() for heading in section_headings]
        assert len(section_contents) == len(section_headings)
    else:
        interlinks = []
        section_contents = []
        section_headings = []

    section_contents = [filter_wiki(section_content) for section_content in section_contents]
    sections = list(zip(section_headings, section_contents))

    if include_interlinks:
        return title, sections, interlinks, articleID
    else:
        return title, sections, articleID
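# A minimal usage sketch for the variant above (hypothetical input, not taken
# from the original source). get_namespace() requires the standard MediaWiki
# export namespace URI, so the sample page declares it explicitly.
sample_page = (
    '<page xmlns="http://www.mediawiki.org/xml/export-0.10/">'
    '<title>Example</title><ns>0</ns><id>42</id>'
    '<revision><text>Lead paragraph.\n==History==\nSome history.\n</text></revision>'
    '</page>'
)
title, sections, article_id = segment(sample_page)
print(title, article_id)         # -> Example 42
print([h for h, _ in sections])  # -> ['Introduction', 'History']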
def segment(page_xml, include_interlinks=False):
    """Parse the content inside a page tag.

    Parameters
    ----------
    page_xml : str
        Content from page tag.
    include_interlinks : bool
        Whether or not interlinks should be parsed.

    Returns
    -------
    (str, list of (str, str), (Optionally) dict of (str: str))
        Structure contains (title, [(section_heading, section_content), ...],
        (Optionally) {interlinks}).

    """
    elem = cElementTree.fromstring(page_xml)
    filter_namespaces = ('0',)
    namespace = get_namespace(elem.tag)
    ns_mapping = {"ns": namespace}
    text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping
    title_path = "./{%(ns)s}title" % ns_mapping
    ns_path = "./{%(ns)s}ns" % ns_mapping
    lead_section_heading = "Introduction"
    top_level_heading_regex = r"\n==[^=].*[^=]==\n"
    top_level_heading_regex_capture = r"\n==([^=].*[^=])==\n"

    title = elem.find(title_path).text
    text = elem.find(text_path).text
    ns = elem.find(ns_path).text
    if ns not in filter_namespaces:
        text = None

    if text is not None:
        if include_interlinks:
            interlinks = find_interlinks(text)
        section_contents = re.split(top_level_heading_regex, text)
        section_headings = [lead_section_heading] + re.findall(top_level_heading_regex_capture, text)
        section_headings = [heading.strip() for heading in section_headings]
        assert len(section_contents) == len(section_headings)
    else:
        interlinks = []
        section_contents = []
        section_headings = []

    section_contents = [filter_wiki(section_content) for section_content in section_contents]
    sections = list(zip(section_headings, section_contents))

    if include_interlinks:
        return title, sections, interlinks
    else:
        return title, sections
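# Same sketch against the stock variant above, this time requesting interlinks
# (hypothetical input; per this variant's docstring, find_interlinks returns a
# dict mapping linked article titles to their anchor text).
sample_page = (
    '<page xmlns="http://www.mediawiki.org/xml/export-0.10/">'
    '<title>Example</title><ns>0</ns><id>42</id>'
    '<revision><text>See [[Other page|other]].\n==History==\nSome history.\n</text></revision>'
    '</page>'
)
title, sections, interlinks = segment(sample_page, include_interlinks=True)
print(interlinks)  # -> {'Other page': 'other'}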
import random

from gensim.corpora.wikicorpus import extract_pages, find_interlinks


def build_dict(N):
    """Randomly sample up to N pages from the dump and record their out-links."""
    pages = extract_pages("enwiki-20181220-pages-articles-multistream.xml")
    page_dict = {}
    elect_id = random.randint(1, 500)  # position of the next page to sample
    idx = 0
    cnt = 1
    for t in pages:  # t is a (title, content, pageid) tuple
        if cnt > N:
            break
        idx += 1
        if idx == elect_id:
            title = t[0]
            interlinks = find_interlinks(t[1])  # parse links from the page content
            outlink_num = len(interlinks)
            page_dict[title] = [cnt, outlink_num, list(interlinks.keys())]
            cnt += 1
            elect_id += random.randint(1, 150)  # jump ahead to the next sample position
    return page_dict
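# Usage sketch for build_dict() (assumes the dump file named above exists in
# the working directory; N bounds how many pages get sampled).
if __name__ == '__main__':
    page_dict = build_dict(10)
    for title, (order, outlink_num, targets) in page_dict.items():
        print('%s: sampled #%d, %d out-links' % (title, order, outlink_num))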
def PreProcessing():
    print('begin time of the program is: ', time.ctime())
    # extract_pages() returns a generator of (title, content, pageid) tuples.
    pages = WIKI.extract_pages(xml_path)
    global tot_word
    cnt_time = 0
    while cnt_time < tot_number:
        curr_page = next(pages)
        # All link targets found in the current page's content.
        redirects = [redirect for keyword, redirect in WIKI.find_interlinks(curr_page[1]).items()]
        cnt_time += 1

        # Assign an integer id to the current title.
        curr_title = curr_page[0]
        if curr_title not in WordDict:
            WordDict[curr_title] = tot_word
            NumDict[tot_word] = curr_title
            tot_word += 1
        org_id = WordDict[curr_title]

        # Assign ids to the link targets and record each edge in both directions.
        for redirect_title in redirects:
            if redirect_title not in WordDict:
                WordDict[redirect_title] = tot_word
                NumDict[tot_word] = redirect_title
                tot_word += 1
            link_id = WordDict[redirect_title]
            if org_id not in OutLink:
                OutLink[org_id] = []
            OutLink[org_id].append(link_id)
            if link_id not in InLink:
                InLink[link_id] = []
            InLink[link_id].append(org_id)
    print('end time of the pre-processing is: ', time.ctime())
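# PreProcessing() relies on module-level state that the original snippet does
# not show. A plausible setup consistent with the names it references (an
# assumption, not taken from the original source):
import time

import gensim.corpora.wikicorpus as WIKI

xml_path = 'enwiki-20181220-pages-articles-multistream.xml'
tot_number = 1000          # how many pages to consume from the dump
tot_word = 0               # next integer id to assign
WordDict, NumDict = {}, {} # title -> id, id -> title
OutLink, InLink = {}, {}   # id -> list of linked ids, in each direction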
def segment(page_xml, include_interlinks=False):
    """Parse the content inside a page tag.

    Parameters
    ----------
    page_xml : str
        Content from page tag.
    include_interlinks : bool
        Whether or not interlinks should be parsed.

    Returns
    -------
    (str, list of (str, str), (float, float), (Optionally) list of (str, str))
        Structure contains (title, [(section_heading, section_content), ...],
        (latitude, longitude), (Optionally) [(interlink_article, interlink_text), ...]).

    """
    elem = cElementTree.fromstring(page_xml)
    filter_namespaces = ('0',)
    namespace = get_namespace(elem.tag)
    ns_mapping = {"ns": namespace}
    text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping
    title_path = "./{%(ns)s}title" % ns_mapping
    ns_path = "./{%(ns)s}ns" % ns_mapping
    lead_section_heading = "Introduction"
    top_level_heading_regex = r"\n==[^=].*[^=]==\n"
    top_level_heading_regex_capture = r"\n==([^=].*[^=])==\n"

    title = elem.find(title_path).text
    text = elem.find(text_path).text
    ns = elem.find(ns_path).text
    if ns not in filter_namespaces:
        text = None

    if text is not None:
        # extract_coordinates() and SIGNS are module-level helpers (see the
        # sketch after this function); coordinates come back as a flat
        # (deg, min, sec, hemisphere, deg, min, sec, hemisphere) sequence.
        coordinates = extract_coordinates(text)
        if len(coordinates) > 2:
            latitude = (coordinates[0] + coordinates[1] / 60.0 + coordinates[2] / 3600.0) * SIGNS[coordinates[3].upper()]
            longitude = (coordinates[4] + coordinates[5] / 60.0 + coordinates[6] / 3600.0) * SIGNS[coordinates[7].upper()]
            coordinates = (latitude, longitude)

        if include_interlinks:
            interlinks = find_interlinks(text)
        section_contents = re.split(top_level_heading_regex, text)
        section_headings = [lead_section_heading] + re.findall(top_level_heading_regex_capture, text)
        section_headings = [heading.strip() for heading in section_headings]
        assert len(section_contents) == len(section_headings)
    else:
        interlinks = []
        section_contents = []
        section_headings = []
        coordinates = None

    section_contents = [filter_wiki(section_content) for section_content in section_contents]
    sections = list(zip(section_headings, section_contents))

    if include_interlinks:
        return title, sections, coordinates, interlinks
    else:
        return title, sections, coordinates
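# extract_coordinates() and SIGNS are dependencies not shown above. A SIGNS
# mapping consistent with the degrees-minutes-seconds conversion the function
# applies (an assumption, not taken from the original source):
SIGNS = {'N': 1.0, 'S': -1.0, 'E': 1.0, 'W': -1.0}

# Worked example of the conversion: 40 deg 26' 46" N, 79 deg 58' 56" W.
coordinates = (40.0, 26.0, 46.0, 'N', 79.0, 58.0, 56.0, 'W')
latitude = (coordinates[0] + coordinates[1] / 60.0 + coordinates[2] / 3600.0) * SIGNS[coordinates[3].upper()]
longitude = (coordinates[4] + coordinates[5] / 60.0 + coordinates[6] / 3600.0) * SIGNS[coordinates[7].upper()]
print(latitude, longitude)  # -> 40.4461..., -79.9822... (west is negative)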