Ejemplo n.º 1
0
def segment(page_xml, include_interlinks=False):
    """Split the XML of one page into its title, sections and page id.

    Parameters
    ----------
    page_xml : str
        Content from page tag.

    include_interlinks : bool
        Whether or not interlinks should be parsed and returned.

    Returns
    -------
    (str, list of (str, str), (Optionally) interlinks, str)
        ``(title, [(section_heading, section_content), ...], article_id)``
        when `include_interlinks` is False, and
        ``(title, [(section_heading, section_content), ...], interlinks, article_id)``
        when it is True. ``interlinks`` is whatever ``find_interlinks``
        returns for the page text, or an empty list when the page has no
        usable text. ``article_id`` is the text of the page's ``id`` child.

    """
    page = cElementTree.fromstring(page_xml)
    tag_ns = {"ns": get_namespace(page.tag)}

    def child_text(path_template):
        # Fill the XML namespace into the path and read the matching element's text.
        return page.find(path_template % tag_ns).text

    title = child_text("./{%(ns)s}title")
    articleID = child_text("./{%(ns)s}id")
    text = child_text("./{%(ns)s}revision/{%(ns)s}text")
    page_ns = child_text("./{%(ns)s}ns")

    # Pages outside namespace "0" are handled exactly like pages without text.
    if page_ns not in ('0', ):
        text = None

    if text is None:
        interlinks = []
        headings = []
        bodies = []
    else:
        if include_interlinks:
            interlinks = find_interlinks(text)
        # Splitting on a pattern with one capture group interleaves the
        # results as [body, heading, body, heading, ..., body], so bodies and
        # headings always pair up one-to-one once the lead heading is added.
        pieces = re.split(r"\n==([^=].*[^=])==\n", text)
        bodies = pieces[0::2]
        headings = [
            heading.strip() for heading in ["Introduction"] + pieces[1::2]
        ]

    sections = [
        (heading, filter_wiki(body)) for heading, body in zip(headings, bodies)
    ]

    if include_interlinks:
        return title, sections, interlinks, articleID
    return title, sections, articleID
Ejemplo n.º 2
0
def segment(page_xml, include_interlinks=False):
    """Split the XML of one page into its title and top-level sections.

    Parameters
    ----------
    page_xml : str
        Content from page tag.

    include_interlinks : bool
        Whether or not interlinks should be parsed and returned.

    Returns
    -------
    (str, list of (str, str), (Optionally) interlinks)
        ``(title, [(section_heading, section_content), ...])`` when
        `include_interlinks` is False; with it set, the result of
        ``find_interlinks`` for the page text is appended as a third item
        (an empty list when the page has no usable text).

    """
    root = cElementTree.fromstring(page_xml)
    xml_ns = {"ns": get_namespace(root.tag)}

    def text_at(path_template):
        # Fill the XML namespace into the path and read the matching element's text.
        return root.find(path_template % xml_ns).text

    title = text_at("./{%(ns)s}title")
    text = text_at("./{%(ns)s}revision/{%(ns)s}text")
    page_ns = text_at("./{%(ns)s}ns")

    # Pages outside namespace "0" are handled exactly like pages without text.
    if page_ns not in ('0',):
        text = None

    if text is None:
        interlinks = []
        headings = []
        bodies = []
    else:
        if include_interlinks:
            interlinks = find_interlinks(text)
        # One split with a capture group yields
        # [body, heading, body, ..., body]; even slots are section bodies and
        # odd slots are the heading texts that separated them.
        pieces = re.split(r"\n==([^=].*[^=])==\n", text)
        bodies = pieces[0::2]
        headings = [name.strip() for name in ["Introduction"] + pieces[1::2]]

    sections = [(name, filter_wiki(body)) for name, body in zip(headings, bodies)]

    if not include_interlinks:
        return title, sections
    return title, sections, interlinks
def build_dict(N, dump_path="enwiki-20181220-pages-articles-multistream.xml"):
    """Randomly sample up to `N` pages from a dump and record their outlinks.

    Pages are read in order from ``extract_pages(dump_path)``. The first
    sampled page sits at a random position in ``[1, 500]``; each following
    sample lies a further random ``[1, 150]`` pages ahead.

    Parameters
    ----------
    N : int
        Maximum number of pages to sample.

    dump_path : str, optional
        Dump handed to ``extract_pages``. Defaults to the file name that was
        previously hard-coded in this function.

    Returns
    -------
    dict of (str: list)
        Maps each sampled page title to
        ``[rank, outlink_count, outlink_names]``: ``rank`` is the 1-based
        order in which the page was sampled, ``outlink_count`` the number of
        entries returned by ``find_interlinks`` and ``outlink_names`` its
        keys. Holds fewer than `N` entries when the dump runs out first; a
        repeated title overwrites the earlier entry.

    """
    page_dict = {}
    # Drawn before the early exit so the random stream is consumed exactly as
    # it was when this draw happened unconditionally.
    next_pick = random.randint(1, 500)
    rank = 1
    if rank > N:
        return page_dict

    for position, page in enumerate(extract_pages(dump_path), start=1):
        if position != next_pick:
            continue
        # NOTE(review): the whole page tuple is stringified here, so link
        # names go through repr() escaping (quotes, newlines). Presumably only
        # the page text should be scanned - confirm the tuple layout.
        interlinks = find_interlinks(str(page))
        page_dict[page[0]] = [rank, len(interlinks), list(interlinks)]
        rank += 1
        next_pick += random.randint(1, 150)
        # Stop right away instead of parsing one more page just to notice
        # that enough pages have been sampled.
        if rank > N:
            break
    return page_dict
Ejemplo n.º 4
0
def PreProcessing():
    """Fill the global title/id tables and link graph from the dump.

    Reads at most ``tot_number`` pages from ``WIKI.extract_pages(xml_path)``.
    Every page title, and every value of the mapping returned by
    ``WIKI.find_interlinks`` for the page's second field, gets a numeric id
    on first sight (``WordDict``: title -> id, ``NumDict``: id -> title,
    ``tot_word``: next free id). For each such link the target id is appended
    to ``OutLink[page_id]`` and the page id to ``InLink[target_id]``.

    Stops early, without raising, when the dump holds fewer than
    ``tot_number`` pages. Prints the start and end time of the run.
    """
    global tot_word
    print('begin time of the program is: ', time.ctime())
    # extract_pages hands the pages out lazily, one per next() call.
    pages = WIKI.extract_pages(xml_path)
    exhausted = object()
    processed = 0
    while processed < tot_number:
        # A bare next() would raise StopIteration on a short dump.
        curr_page = next(pages, exhausted)
        if curr_page is exhausted:
            break
        link_titles = WIKI.find_interlinks(curr_page[1]).values()
        processed += 1

        # Register the page's own title.
        curr_title = curr_page[0]
        if curr_title not in WordDict:
            WordDict[curr_title] = tot_word
            NumDict[tot_word] = curr_title
            tot_word += 1
        org_id = WordDict[curr_title]

        for link_title in link_titles:
            # Register the link target, then record the edge in both directions.
            if link_title not in WordDict:
                WordDict[link_title] = tot_word
                NumDict[tot_word] = link_title
                tot_word += 1
            link_id = WordDict[link_title]
            OutLink.setdefault(org_id, []).append(link_id)
            InLink.setdefault(link_id, []).append(org_id)
    print('end time of the pre-processing is: ', time.ctime())
Ejemplo n.º 5
0
def segment(page_xml, include_interlinks=False):
    """Split the XML of one page into title, sections and coordinates.

    Parameters
    ----------
    page_xml : str
        Content from page tag.
    include_interlinks : bool
        Whether or not interlinks should be parsed and returned.

    Returns
    -------
    (str, list of (str, str), coordinates, (Optionally) interlinks)
        ``(title, [(section_heading, section_content), ...], coordinates)``
        when `include_interlinks` is False; with it set, the result of
        ``find_interlinks`` for the page text is appended as a fourth item
        (an empty list when the page has no usable text).
        ``coordinates`` is None when the page has no usable text. Otherwise
        it is the result of ``extract_coordinates``, converted to a
        ``(latitude, longitude)`` pair when that result has more than two
        entries.
    """
    page = cElementTree.fromstring(page_xml)
    tag_ns = {"ns": get_namespace(page.tag)}

    def child_text(path_template):
        # Fill the XML namespace into the path and read the matching element's text.
        return page.find(path_template % tag_ns).text

    title = child_text("./{%(ns)s}title")
    text = child_text("./{%(ns)s}revision/{%(ns)s}text")
    page_ns = child_text("./{%(ns)s}ns")

    # Pages outside namespace "0" are handled exactly like pages without text.
    if page_ns not in ('0', ):
        text = None

    if text is None:
        interlinks = []
        headings = []
        bodies = []
        coordinates = None
    else:
        coordinates = extract_coordinates(text)
        if len(coordinates) > 2:
            # Presumably (degrees, minutes, seconds, hemisphere) for latitude
            # followed by the same four values for longitude - TODO confirm.
            # NOTE(review): the guard only requires more than two entries,
            # yet indices up to 7 are read below.
            parts = coordinates
            latitude = (
                parts[0] + parts[1] / 60.0 + parts[2] / 3600.00
            ) * SIGNS[parts[3].upper()]
            longitude = (
                parts[4] + parts[5] / 60.0 + parts[6] / 3600.00
            ) * SIGNS[parts[7].upper()]
            coordinates = (latitude, longitude)
        if include_interlinks:
            interlinks = find_interlinks(text)
        # One split with a capture group yields
        # [body, heading, body, ..., body]; even slots are section bodies and
        # odd slots are the heading texts that separated them.
        pieces = re.split(r"\n==([^=].*[^=])==\n", text)
        bodies = pieces[0::2]
        headings = [
            heading.strip() for heading in ["Introduction"] + pieces[1::2]
        ]

    sections = [
        (heading, filter_wiki(body)) for heading, body in zip(headings, bodies)
    ]

    if include_interlinks:
        return title, sections, coordinates, interlinks
    return title, sections, coordinates