# Missing imports for this snippet; the project-specific helpers
# extract_section_links, parse and get_examples_with_path are assumed to be
# defined elsewhere in the same module.
from collections import Counter
from trec_car.read_data import iter_annotations, iter_pages


def run(loc, out_loc):
    data = {}
    linked_page_ids = set()
    total_results = 0
    page_counter = 0
    out_writer = open(out_loc, "w")
    seen_hashed_ids = set()
    stats = Counter()
    with open(loc, 'rb') as f:
        for page in iter_annotations(f):
            page_counter += 1
            examples = extract_section_links(page, stats)
            if page_counter % 1000 == 0:
                print(stats)
                print("Pages Parse / Example Created: {} / {}".format(page_counter, total_results))
            parse(examples, data, linked_page_ids)
            total_results += len(examples)

    with open(loc, 'rb') as f:
        page_counter = 0
        for page in iter_pages(f):
            page_counter += 1
            # if page_counter % 1000 == 0:
            #     print(stats)
            #     print("Pages Parse / Example Created: {} / {}".format(page_counter, total_results))
            get_examples_with_path(out_writer, page, data, linked_page_ids, seen_hashed_ids, stats)


    out_writer.close()
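A minimal invocation sketch; the file paths below are placeholders, and extract_section_links, parse and get_examples_with_path are assumed to be defined in the same module:

if __name__ == "__main__":
    # hypothetical paths: a TREC CAR annotations CBOR in, a text file of examples out
    run("train.pages.cbor", "section_link_examples.txt")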
Example #2
def build_q(read_path=Wiki.file_path_list[0],
            write_path=write_q_path,
            page_limit=1):

    print('*** reading {} articles from file: {} ***'.format(
        page_limit, read_path))
    with open(read_path, 'rb') as f_read:
        with open(write_path, 'w') as f_write:

            counter = 1

            for p in iter_pages(f_read):

                if counter % 1000 == 0:
                    print('{} / {} pages processed'.format(
                        counter, page_limit))

                page_name = p.page_name

                q_list = [
                    page_name + " " +
                    " ".join([str(section.heading) for section in sectionpath])
                    for sectionpath in p.flat_headings_list()
                ]

                for q in q_list:

                    f_write.write(p.page_id + "\t" + q + "\n")

                if counter >= page_limit:
                    break

                counter += 1
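A minimal invocation sketch (the paths and page count are placeholders, not values from the original project):

# writes one "page_id<TAB>page name + heading path" query line per heading path
build_q(read_path="benchmark.pages.cbor", write_path="queries.tsv", page_limit=100)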
    def process_article(self, article_file):
        headings_para_dict = {}
        with open(article_file, 'rb') as f:
            for p in iter_pages(f):
                if len(p.outline()) > 0:
                    sections_list = p.flat_headings_list()
                    for section_path in sections_list:
                        # a section path gives the hierarchy of sections as a list:
                        # section_path[0] / section_path[1] / ...
                        heading_path = " / ".join(
                            [section.heading for section in section_path])
                        para_children = []
                        # take the last child (the lowest heading in the hierarchy)
                        for cb in section_path[-1].children:
                            if hasattr(cb, 'paragraph'):
                                para_children.append(' '.join([
                                    c.text if isinstance(c, ParaText) else
                                    c.anchor_text for c in cb.paragraph.bodies
                                ]))
                            elif hasattr(cb, 'body'):
                                para_children.append(' '.join([
                                    c.text if isinstance(c, ParaText) else
                                    c.anchor_text for c in cb.body.bodies
                                ]))
                        heading_key = p.page_name + " / " + heading_path
                        headings_para_dict[heading_key] = ' '.join(para_children)
        return headings_para_dict
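A hypothetical call, assuming `proc` is an instance of the (not shown) class defining this method and the path is a placeholder for a pages CBOR file:

headings_to_text = proc.process_article("articles.cbor")   # placeholder path
for heading_path, text in headings_to_text.items():
    # keys look like "<page name> / <heading> / ... / <lowest heading>"
    print(heading_path, "->", text[:80])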
Example #4
    def retrieve_ordinal_map(self):
        ordinal_map = {}
        context_map = {}
        with open(self.outline_loc, 'rb') as f:
            for page in iter_pages(f):
                contexts = self.retrieve_page_context(page)
                context_map[page.page_id] = contexts
                flattened = self.flatten(page.skeleton)
                # map each flattened skeleton element to its original position
                ordinal_map[page.page_id] = {
                    node: idx for idx, node in enumerate(flattened)
                }

        return ordinal_map, context_map
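For reference, a self-contained sketch of the same ordinal-map idea using only trec_car.read_data (the path is a placeholder); it records, per page, the original position of each top-level heading:

from trec_car.read_data import iter_pages

simple_ordinal_map = {}
with open("outlines.cbor", "rb") as f:   # placeholder path
    for page in iter_pages(f):
        # heading text -> position among the page's top-level sections
        simple_ordinal_map[page.page_id] = {
            section.heading: idx for idx, section in enumerate(page.outline())
        }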
Example #5
def print_articles(path=Wiki.file_path_list[5], limit=1):

    print('*** reading {} articles from file: {} ***'.format(limit, path))
    with open(path, 'rb') as f:

        counter = 1

        for p in iter_pages(f):

            print('*** PRINTING PAGE {} ***'.format(counter))
            print(
                'PageType  -> ArticlePage | CategoryPage | RedirectPage ParaLink | DisambiguationPage'
            )

            print('----------------------- RAW PAGE  -----------------------')

            print(p)

            print('----------------------- INFO  -----------------------')
            print('page_name:', p.page_name)
            print('page_id:', p.page_id)
            print('page_type:', p.page_type)

            print('-----------------------  METADATA  -----------------------')

            print(
                'PageMetadata -> RedirectNames DisambiguationNames DisambiguationIds CategoryNames CategoryIds InlinkIds InlinkAnchors'
            )
            print('''
            RedirectNames       -> [$pageName] 
            DisambiguationNames -> [$pageName] 
            DisambiguationIds   -> [$pageId] 
            CategoryNames       -> [$pageName] 
            CategoryIds         -> [$pageId] 
            InlinkIds           -> [$pageId] 
            InlinkAnchors       -> [$anchorText])
            ''')

            print('page_meta:', p.page_meta)

            print('----------------------- SKELETON  -----------------------')

            print('''
            Section      -> $sectionHeading [PageSkeleton]
            Para         -> Paragraph
            Paragraph    -> $paragraphId, [ParaBody]
            ListItem     -> $nestingLevel, Paragraph
            Image        -> $imageURL [PageSkeleton]
            ParaBody     -> ParaText | ParaLink
            ParaText     -> $text
            ParaLink     -> $targetPage $targetPageId $linkSection $anchorText
            ''')
            print(p.skeleton)

            print(
                '----------------------- HEADINGS RAW  -----------------------'
            )
            # get one data structure with nested (heading, [children]) pairs
            headings = p.nested_headings()
            print(headings)

            print(
                '----------------------- HEADINGS UNPACKED  -----------------------'
            )
            if len(p.outline()) > 0:
                print(p.outline()[0].__str__())

                print('deep headings= ',
                      [(str(section.heading), len(children))
                       for (section, children) in p.deep_headings_list()])

                print('flat headings= ', [
                    "/".join([str(section.heading) for section in sectionpath])
                    for sectionpath in p.flat_headings_list()
                ])

            if counter >= limit:
                break

            counter += 1
Example #6
import spacy
import csv
import os
import time
from collections import Counter
from spacy import displacy
# missing in the original snippet: the TREC CAR reader used below as `dat`
from trec_car import read_data as dat

csvfile = str(os.path.dirname(os.path.dirname(
    os.getcwd()))) + '/data/character-deaths.csv'
names = []
namelist = []

f = open(
    str(os.path.dirname(os.path.dirname(os.getcwd()))) + '/data/got.cbor',
    'rb')
pages = list(dat.iter_pages(f))
f.close()


#Replaces the names in the name list with the redirect names for characters who have a redirect link to another page in the cbor
def Rename_Redirect(namelist):
    file = open(
        str(os.path.dirname(os.path.dirname(os.getcwd()))) + '/data/names.txt',
        'w')
    file.write("Name\n")
    for n in namelist:
        flag = True
        if n == "Victarion Greyjoy":
            file.write("House Greyjoy\n")
            continue
        for page in pages:
# Missing imports for this snippet; project-specific helpers such as get_parser,
# PageEval, PageRelevanceCache, QrelFile, OutlineReader, load_compat_file,
# flat_paragraphs, score_run, safe_group_by and print_eval_line are assumed to be
# defined elsewhere.
import os
import sys
from typing import Dict, List, Optional

from trec_car.read_data import iter_pages


def eval_main() -> None:
    parsed = get_parser()
    outlines_cbor_file = parsed["outline_cbor"]  # type: str
    run_dir = parsed["run_directory"]  # type: Optional[str]
    run_file = parsed["run_file"]  # type: Optional[str]
    qrels_file = parsed["qrels"]  # type: Optional[str]
    compat_file = parsed["compat"]  # type: Optional[str]
    max_possible_relevance = parsed["max_relevance"]  # type:int
    gold_pages_file = parsed["gold_pages"]  # type: str
    if not run_file and not run_dir:
        print("Error: Either run_file or run_dir must be given.",
              file=sys.stderr)
        sys.exit(1)

    eval_data = dict()  # type: Dict[str, List[PageEval]] # runName
    relevance_cache = dict()  # type: Dict[str, PageRelevanceCache]

    compat_y2_to_y3 = {
        entry.y2SectionId: entry.sectionId
        for entry in load_compat_file(compat_file)
    } if compat_file else None
    # compat_y3_to_y2 = [(entry.sectionId, entry.y2SectionId) for entry in load_compat_file(compat_file)]

    if qrels_file:
        qrel_data = QrelFile(qrels_file, qid_translation_map=compat_y2_to_y3)
        qrel_max_possible_relevance = qrel_data.max_possible_relevance()
    else:
        qrel_data = None
        qrel_max_possible_relevance = 1

    with open(outlines_cbor_file, 'rb') as f:
        for page in OutlineReader.initialize_pages(f):
            relevance_cache[page.squid] = PageRelevanceCache(
                page,
                max_possible_relevance=max_possible_relevance
                if max_possible_relevance else qrel_max_possible_relevance)

    num_pages = len(relevance_cache)

    if qrels_file:
        qrels_by_squid = qrel_data.group_by_squid(relevance_cache.keys())
        for squid, qrel_lines in qrels_by_squid.items():
            pageCache = relevance_cache[squid]
            for qline in qrel_lines:
                pageCache.add_paragraph_facet(qid=qline.qid,
                                              para_id=qline.doc_id,
                                              relevance=qline.relevance)
    else:
        print("No qrels file given, won't produce relevance and facet scores.",
              file=sys.stderr)

    if gold_pages_file:
        with open(gold_pages_file, 'rb') as gold_pages_handle:
            for goldpage in iter_pages(gold_pages_handle):
                gold_paragraph_sequence = flat_paragraphs(goldpage)
                # pair every gold paragraph with its 1-based position on the page
                relevance_cache[goldpage.page_id].set_paragraph_position_list(
                    [(para, pos) for pos, para
                     in enumerate(gold_paragraph_sequence, start=1)])
    else:
        print("No gold-pages file given, won't produce transition scores.",
              file=sys.stderr)

    # todo rundir
    if run_file:
        score_run(eval_data, relevance_cache, run_file)
    if run_dir:
        for f in os.listdir(run_dir):
            if f.endswith(".jsonl"):
                score_run(eval_data, relevance_cache, run_dir + os.sep + f)

    # for name, evals in eval_data.items():
    #     for metric, evals_ in safe_group_by([(eval.metric, eval) for eval in evals]).items():
    #         print_eval_line(evals_, metric, name, num_pages)

    all_eval_lines = [elem for evs in eval_data.values() for elem in evs]
    for metric, evals_ in safe_group_by([(e.metric, e)
                                         for e in all_eval_lines]).items():
        for name, evals in safe_group_by([(e.run_id, e)
                                          for e in evals_]).items():
            print_eval_line(evals, metric, name, num_pages)
    def extract_info(self):
        # read cbor file
        f = open("../data/got.cbor", "rb")
        self.pages = list(read_data.iter_pages(f))
        f.close()

        self.names_list = pd.concat([self.training_names, self.test_names])

        # training_names = self.training_names
        # test_names = self.test_names
        # name_list = self.names_list
        # name_list = ['jon snow', 'arya stark']
        # dictionary of page_names (page titles) -> page_id
        page_name_dict = {}
        # Parse the whole wiki as an inverted index. Every sentence
        # is indexed with the page_id. We only have one wiki, hence only one
        # document exists in the inverted index.
        page_name_index = Index()
        # dictionary of redirect names, inlink anchors and disambiguation names
        # in the wiki to the page_name
        name_dict_base = {}

        id = 0
        for page in self.pages:
            # print(id, page.page_name)
            my_page_name = (page.page_name.replace('(', '').replace(
                ')', '').replace('[', '').replace(']', '').lower())

            extra_info = ''
            page_name_dict[my_page_name] = id

            if name_dict_base.get(my_page_name) is None:
                name_dict_base[my_page_name] = [my_page_name]

            # else:
            #     name_dict_base[my_page_name].append(my_page_name)

            if page.page_meta.inlinkAnchors is not None:
                for inlink in page.page_meta.inlinkAnchors:
                    name_dict_base[my_page_name].append(inlink[0])
                    extra_info = ' '.join([extra_info, inlink[0]])
                    if name_dict_base.get(inlink[0]) is None:
                        name_dict_base[inlink[0]] = [my_page_name]
                    else:
                        name_dict_base[inlink[0]].append(my_page_name)

            if page.page_meta.redirectNames is not None:
                for redirect_name in page.page_meta.redirectNames:
                    name_dict_base[my_page_name].append(redirect_name)
                    extra_info = ' '.join([extra_info, redirect_name])
                    if name_dict_base.get(redirect_name) is None:
                        name_dict_base[redirect_name] = [my_page_name]
                    else:
                        name_dict_base[redirect_name].append(my_page_name)

            if page.page_meta.disambiguationNames is not None:
                for dis_name in page.page_meta.disambiguationNames:
                    name_dict_base[my_page_name].append(dis_name)
                    extra_info = ' '.join([extra_info, dis_name])
                    if name_dict_base.get(dis_name) is None:
                        name_dict_base[dis_name] = [my_page_name]
                    else:
                        name_dict_base[dis_name].append(my_page_name)

            page_name_index.add_sentence(' '.join([my_page_name, extra_info]))
            # page_name_index.add_sentence(my_page_name)

            id += 1

        page_name_index.finish_adding_sentences()
        self.pages_inverted_index = page_name_index
        self.page_name_dict = page_name_dict
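For reference, a self-contained sketch of the alias-collection idea used above, relying only on trec_car.read_data and the page_meta fields already referenced (the file path is a placeholder):

from trec_car.read_data import iter_pages

aliases = {}   # lowercased page title -> known surface forms
with open("got.cbor", "rb") as f:   # placeholder path
    for page in iter_pages(f):
        title = page.page_name.lower()
        forms = aliases.setdefault(title, [title])
        meta = page.page_meta
        if meta.inlinkAnchors:
            # mirror the original: each entry's first element is the anchor text
            forms.extend(inlink[0] for inlink in meta.inlinkAnchors)
        if meta.redirectNames:
            forms.extend(meta.redirectNames)
        if meta.disambiguationNames:
            forms.extend(meta.disambiguationNames)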