def filter_general_ens(lines):
    """Keeps only the annotation lines whose entity is in the KB snapshot.

    The entity is taken from the third field (index 2) of each line and is
    converted to a DBpedia URI before it is looked up in KB_SNP_DBP.
    Fields are presumably ordered as: qid, score, en, men, fb_id
    (as listed by the previous docstring; verify against the caller).

    :param lines: iterable of indexable annotation records
    :return: list of the records that passed the check, in input order
    """
    return [
        annot for annot in lines
        if WikipediaUtils.wiki_uri_to_dbp_uri(annot[2]) in KB_SNP_DBP
    ]
 def ask_title(self, page_id):
     """Resolves a page id to a Wikipedia URI, asking the API only once per id.

     Ids already present in self.id_title_dict are answered from that
     dictionary. Otherwise the title is requested from DESC_DEXTER_URI,
     converted to a Wikipedia URI and stored before it is returned.

     :param page_id: page id to look up
     :return: value stored in self.id_title_dict for the page id
     """
     if page_id in self.id_title_dict:
         return self.id_title_dict[page_id]

     query = "".join(["?id=", str(page_id), "&title-only=true"])
     response = requests.get(self.DESC_DEXTER_URI + query)
     # A response without a 'title' key is handled as an empty title.
     page_title = response.json().get('title', "")
     self.id_title_dict[page_id] = WikipediaUtils.wiki_title_to_uri(
         page_title.encode("utf-8"))
     return self.id_title_dict[page_id]
 def ask_title(self, page_id):
     """Returns the Wikipedia URI of a page id, caching the API answer.

     The API is contacted only for ids that are not yet a key of
     self.id_title_dict; the URI built from the returned title is stored
     there under the page id.

     :param page_id: page id sent to the API
     :return: entry of self.id_title_dict for the page id
     """
     known = page_id in self.id_title_dict
     if not known:
         url = self.DESC_DEXTER_URI + ("?id=" + str(page_id) + "&title-only=true")
         payload = requests.get(url).json()
         # Falls back to an empty title when the answer has no 'title' key.
         encoded_title = payload.get('title', "").encode("utf-8")
         self.id_title_dict[page_id] = WikipediaUtils.wiki_title_to_uri(encoded_title)
     return self.id_title_dict[page_id]
Beispiel #4
0
 def add_redirects(self, redirect_file):
     """Adds redirect pages to the surface form dictionary."""
     print "Adding redirects ..."
     redirects = open(redirect_file, "r")
     count = 0
     for line in redirects:
         cols = line.strip().split("\t")
         sf = cols[0].strip().lower()
         wiki_uri = WikipediaUtils.wiki_title_to_uri(cols[1].strip())
         # print sf, wiki_uri
         self.__add_to_dict(sf, "redirect", wiki_uri)
         count += 1
         if count % 1000000 == 0:
             print "Processed ", count, "th redirects."
 def add_redirects(self, redirect_file):
     """Adds redirect pages to the surface form dictionary."""
     print "Adding redirects ..."
     redirects = open(redirect_file, "r")
     count = 0
     for line in redirects:
         cols = line.strip().split("\t")
         sf = cols[0].strip().lower()
         wiki_uri = WikipediaUtils.wiki_title_to_uri(cols[1].strip())
         # print sf, wiki_uri
         self.__add_to_dict(sf, "redirect", wiki_uri)
         count += 1
         if count % 1000000 == 0:
             print "Processed ", count, "th redirects."
 def add_anchors(self, anchor_file):
     print "Adding anchors ..."
     i = 0
     infile = open(anchor_file, "r")
     for line in infile:
         # print line
         cols = line.strip().split("\t")
         sf = cols[0].strip()
         count = int(cols[2])
         wiki_uri = WikipediaUtils.wiki_title_to_uri(unquote(cols[1].strip()))
         self.__add_to_dict(sf, "anchor", wiki_uri, count)
         i += 1
         if i % 1000000 == 0:
             print "Processed", i, "th anchor!"
Beispiel #7
0
 def add_anchors(self, anchor_file):
     print "Adding anchors ..."
     i = 0
     infile = open(anchor_file, "r")
     for line in infile:
         # print line
         cols = line.strip().split("\t")
         sf = cols[0].strip()
         count = int(cols[2])
         wiki_uri = WikipediaUtils.wiki_title_to_uri(
             unquote(cols[1].strip()))
         self.__add_to_dict(sf, "anchor", wiki_uri, count)
         i += 1
         if i % 1000000 == 0:
             print "Processed", i, "th anchor!"
Beispiel #8
0
 def add_titles(self, title_file):
     """Adds titles and title name variants to the surface form dictionary."""
     print "Adding titles ..."
     redirects = open(title_file, "r")
     count = 0
     for line in redirects:
         cols = line.strip().split("\t")
         title = unquote(cols[1].strip())
         wiki_uri = WikipediaUtils.wiki_title_to_uri(title)
         self.__add_to_dict(title.lower(), "title", wiki_uri)
         title_nv = self.__title_nv(title)
         if (title_nv != title) and (title_nv.strip() != ""):
             self.__add_to_dict(title_nv.lower(), "title-nv", wiki_uri)
         count += 1
         if count % 1000000 == 0:
             print "Processed ", count, "th titles."
 def add_titles(self, title_file):
     """Adds titles and title name variants to the surface form dictionary."""
     print "Adding titles ..."
     redirects = open(title_file, "r")
     count = 0
     for line in redirects:
         cols = line.strip().split("\t")
         title = unquote(cols[1].strip())
         wiki_uri = WikipediaUtils.wiki_title_to_uri(title)
         self.__add_to_dict(title.lower(), "title", wiki_uri)
         title_nv = self.__title_nv(title)
         if (title_nv != title) and (title_nv.strip() != ""):
             self.__add_to_dict(title_nv.lower(), "title-nv", wiki_uri)
         count += 1
         if count % 1000000 == 0:
             print "Processed ", count, "th titles."
Beispiel #10
0
    def index_file(self, file_name):
        """
        Adds one file to the index.

        :param file_name: file to be indexed
        """
        self.contents = []
        article_text = ""
        article_annots = []  # for annot-only index

        f = open(file_name, "r")
        for line in f:
            line = line.replace("#redirect", "")
            # ------ Reaches the end tag for an article ---------
            if re.search(r'</doc>', line):
                # ignores null titles
                if wiki_uri is None:
                    print "\tINFO: Null Wikipedia title!"
                # ignores disambiguation pages
                elif (wiki_uri.endswith("(disambiguation)>")) or \
                        ((len(article_text) < 200) and ("may refer to:" in article_text)):
                    print "\tINFO: disambiguation page " + wiki_uri + " ignored!"
                # ignores list pages
                elif (wiki_uri.startswith("<wikipedia:List_of")) or (
                        wiki_uri.startswith("<wikipedia:Table_of")):
                    print "\tINFO: List page " + wiki_uri + " ignored!"
                # adds the document to the index
                else:
                    self.__add_to_contents(Lucene.FIELDNAME_ID, wiki_uri,
                                           Lucene.FIELDTYPE_ID)
                    if self.annot_only:
                        self.__add_to_contents(Lucene.FIELDNAME_CONTENTS,
                                               article_annots,
                                               Lucene.FIELDTYPE_ID_TV)
                    else:
                        self.__add_to_contents(Lucene.FIELDNAME_CONTENTS,
                                               article_text,
                                               Lucene.FIELDTYPE_TEXT_TVP)
                    self.lucene.add_document(self.contents)
                self.contents = []
                article_text = ""
                article_annots = []

            # ------ Process other lines of article ---------
            tag_iter = list(self.tagRE.finditer(line))
            # adds line to content if there is no annotation
            if len(tag_iter) == 0:
                article_text += line
                continue
            # A tag is detected in the line
            for t in tag_iter:
                tag = t.group(3)
                if tag == "doc":
                    doc_title = self.titleRE.search(t.group(2))
                    wiki_uri = WikipediaUtils.wiki_title_to_uri(
                        doc_title.group(1)) if doc_title else None
                if tag == "a":
                    article_text += t.group(1) + t.group(
                        4
                    )  # resolves annotations and replace them with mention
                    # extracts only annotations
                    if self.annot_only:
                        link_title = self.linkRE.search(t.group(2))
                        link_uri = WikipediaUtils.wiki_title_to_uri(
                            unquote(
                                link_title.group(1))) if link_title else None
                        if link_uri is not None:
                            article_annots.append(link_uri)
                        else:
                            print "\nINFO: link to the annotation not found in " + file_name
            last_span = tag_iter[-1].span()
            article_text += line[last_span[1]:]
        f.close()
 def __get_uri_from_title(self, title):
     """Converts a title to a Wikipedia URI; TagmeAPI.NONE is returned as is."""
     if title != TagmeAPI.NONE:
         return WikipediaUtils.wiki_title_to_uri(title)
     return TagmeAPI.NONE
 def __get_uri_from_title(self, title):
     """Maps a title to its Wikipedia URI.

     :param title: title to convert, or TagmeAPI.NONE
     :return: TagmeAPI.NONE for the TagmeAPI.NONE title, the URI otherwise
     """
     has_title = title != TagmeAPI.NONE
     if not has_title:
         return TagmeAPI.NONE
     return WikipediaUtils.wiki_title_to_uri(title)
    def index_file(self, file_name):
        """
        Adds one file to the index.

        :param file_name: file to be indexed
        """
        self.contents = []
        article_text = ""
        article_annots = []  # for annot-only index

        f = open(file_name, "r")
        for line in f:
            line = line.replace("#redirect", "")
            # ------ Reaches the end tag for an article ---------
            if re.search(r'</doc>', line):
                # ignores null titles
                if wiki_uri is None:
                    print "\tINFO: Null Wikipedia title!"
                # ignores disambiguation pages
                elif (wiki_uri.endswith("(disambiguation)>")) or \
                        ((len(article_text) < 200) and ("may refer to:" in article_text)):
                    print "\tINFO: disambiguation page " + wiki_uri + " ignored!"
                # ignores list pages
                elif (wiki_uri.startswith("<wikipedia:List_of")) or (wiki_uri.startswith("<wikipedia:Table_of")):
                    print "\tINFO: List page " + wiki_uri + " ignored!"
                # adds the document to the index
                else:
                    self.__add_to_contents(Lucene.FIELDNAME_ID, wiki_uri, Lucene.FIELDTYPE_ID)
                    if self.annot_only:
                        self.__add_to_contents(Lucene.FIELDNAME_CONTENTS, article_annots, Lucene.FIELDTYPE_ID_TV)
                    else:
                        self.__add_to_contents(Lucene.FIELDNAME_CONTENTS, article_text, Lucene.FIELDTYPE_TEXT_TVP)
                    self.lucene.add_document(self.contents)
                self.contents = []
                article_text = ""
                article_annots = []

            # ------ Process other lines of article ---------
            tag_iter = list(self.tagRE.finditer(line))
            # adds line to content if there is no annotation
            if len(tag_iter) == 0:
                article_text += line
                continue
            # A tag is detected in the line
            for t in tag_iter:
                tag = t.group(3)
                if tag == "doc":
                    doc_title = self.titleRE.search(t.group(2))
                    wiki_uri = WikipediaUtils.wiki_title_to_uri(doc_title.group(1)) if doc_title else None
                if tag == "a":
                    article_text += t.group(1) + t.group(4)  # resolves annotations and replace them with mention
                    # extracts only annotations
                    if self.annot_only:
                        link_title = self.linkRE.search(t.group(2))
                        link_uri = WikipediaUtils.wiki_title_to_uri(unquote(link_title.group(1))) if link_title else None
                        if link_uri is not None:
                            article_annots.append(link_uri)
                        else:
                            print "\nINFO: link to the annotation not found in " + file_name
            last_span = tag_iter[-1].span()
            article_text += line[last_span[1]:]
        f.close()