def filter_general_ens(lines):
    """Keeps only the lines whose entity is part of the KB snapshot.

    The item at index 2 of every line is converted with
    WikipediaUtils.wiki_uri_to_dbp_uri; the line is kept when the converted
    URI is a member of KB_SNP_DBP.

    :param lines: iterable of indexable records; index 2 holds the URI to
        check (presumably the records are "qid score en men fb_id" columns
        -- confirm against the caller)
    :return: list with the lines that passed the check, in input order
    """
    return [
        record for record in lines
        if WikipediaUtils.wiki_uri_to_dbp_uri(record[2]) in KB_SNP_DBP
    ]
def ask_title(self, page_id):
    """Returns the Wikipedia URI of a page, asking the API on a cache miss.

    Answers are memoized in self.id_title_dict, so every page id triggers
    at most one request.

    :param page_id: id of the page to look up
    :return: value of WikipediaUtils.wiki_title_to_uri for the page title
        (an empty title is used when the response has no 'title' entry)
    """
    cache = self.id_title_dict
    # Cache hit: no request is needed.
    if page_id in cache:
        return cache[page_id]

    query = "?id={0}&title-only=true".format(str(page_id))
    response = requests.get(self.DESC_DEXTER_URI + query).json()
    page_title = response.get('title', "")
    uri = WikipediaUtils.wiki_title_to_uri(page_title.encode("utf-8"))
    cache[page_id] = uri
    return uri
def add_redirects(self, redirect_file):
    """Adds redirect pages to the surface form dictionary.

    Every line of the file is split on tabs: the first column (stripped and
    lower-cased) is the surface form, the second column is the title that is
    converted with WikipediaUtils.wiki_title_to_uri.

    :param redirect_file: path of the tab-separated redirects file
    """
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print("Adding redirects ...")
    # The context manager closes the file even if a line fails to parse.
    with open(redirect_file, "r") as redirects:
        for count, line in enumerate(redirects, 1):
            cols = line.strip().split("\t")
            sf = cols[0].strip().lower()
            wiki_uri = WikipediaUtils.wiki_title_to_uri(cols[1].strip())
            self.__add_to_dict(sf, "redirect", wiki_uri)
            # Progress report every million lines.
            if count % 1000000 == 0:
                print(" ".join(["Processed ", str(count), "th redirects."]))
def add_anchors(self, anchor_file):
    """Adds anchor texts to the surface form dictionary.

    Every line of the file is split on tabs: the first column is the
    surface form, the second column is the (URL-quoted) title that is
    converted with WikipediaUtils.wiki_title_to_uri, and the third column
    is an integer count passed along to the dictionary.

    :param anchor_file: path of the tab-separated anchors file
    """
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print("Adding anchors ...")
    # The context manager closes the file even if a line fails to parse.
    with open(anchor_file, "r") as infile:
        for i, line in enumerate(infile, 1):
            cols = line.strip().split("\t")
            sf = cols[0].strip()
            count = int(cols[2])
            wiki_uri = WikipediaUtils.wiki_title_to_uri(
                unquote(cols[1].strip()))
            self.__add_to_dict(sf, "anchor", wiki_uri, count)
            # Progress report every million lines.
            if i % 1000000 == 0:
                print(" ".join(["Processed", str(i), "th anchor!"]))
def add_anchors(self, anchor_file):
    """Adds anchor texts to the surface form dictionary.

    Every line of the file is split on tabs: the first column is the
    surface form, the second column is the (URL-quoted) title that is
    converted with WikipediaUtils.wiki_title_to_uri, and the third column
    is an integer count passed along to the dictionary.

    :param anchor_file: path of the tab-separated anchors file
    """
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print("Adding anchors ...")
    # The context manager closes the file even if a line fails to parse.
    with open(anchor_file, "r") as infile:
        for i, line in enumerate(infile, 1):
            cols = line.strip().split("\t")
            sf = cols[0].strip()
            count = int(cols[2])
            wiki_uri = WikipediaUtils.wiki_title_to_uri(
                unquote(cols[1].strip()))
            self.__add_to_dict(sf, "anchor", wiki_uri, count)
            # Progress report every million lines.
            if i % 1000000 == 0:
                print(" ".join(["Processed", str(i), "th anchor!"]))
def add_titles(self, title_file):
    """Adds titles and title name variants to the surface form dictionary.

    Every line of the file is split on tabs and the second column is taken
    as the (URL-quoted) title. The lower-cased title is added with type
    "title"; the variant returned by self.__title_nv is added with type
    "title-nv" when it differs from the title and is not blank.

    :param title_file: path of the tab-separated titles file
    """
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print("Adding titles ...")
    # The context manager closes the file even if a line fails to parse.
    with open(title_file, "r") as titles:
        for count, line in enumerate(titles, 1):
            cols = line.strip().split("\t")
            title = unquote(cols[1].strip())
            wiki_uri = WikipediaUtils.wiki_title_to_uri(title)
            self.__add_to_dict(title.lower(), "title", wiki_uri)
            title_nv = self.__title_nv(title)
            # Skip variants that add nothing new or are empty.
            if (title_nv != title) and (title_nv.strip() != ""):
                self.__add_to_dict(title_nv.lower(), "title-nv", wiki_uri)
            # Progress report every million lines.
            if count % 1000000 == 0:
                print(" ".join(["Processed ", str(count), "th titles."]))
def index_file(self, file_name):
    """
    Adds one file to the index.

    The file is read line by line. A "doc" tag sets the Wikipedia URI of the
    current article, "a" tags are replaced by their surrounding/inner text
    (and, in annot-only mode, their link targets are collected). A line
    containing "</doc>" closes the article, which is then added to the
    index unless it has no title, is a disambiguation page, or is a
    "List of"/"Table of" page.

    :param file_name: file to be indexed
    """
    self.contents = []
    article_text = ""
    article_annots = []  # for annot-only index
    # Defined up front so that a "</doc>" seen before any "doc" tag is
    # reported as a null title instead of raising a NameError.
    wiki_uri = None

    # The context manager closes the file even if indexing raises.
    with open(file_name, "r") as f:
        for line in f:
            line = line.replace("#redirect", "")
            # ------ Reaches the end tag for an article ---------
            if "</doc>" in line:
                # ignores null titles
                if wiki_uri is None:
                    print("\tINFO: Null Wikipedia title!")
                # ignores disambiguation pages
                elif wiki_uri.endswith("(disambiguation)>") or \
                        ((len(article_text) < 200) and ("may refer to:" in article_text)):
                    print("\tINFO: disambiguation page " + wiki_uri + " ignored!")
                # ignores list pages
                elif wiki_uri.startswith(("<wikipedia:List_of", "<wikipedia:Table_of")):
                    print("\tINFO: List page " + wiki_uri + " ignored!")
                # adds the document to the index
                else:
                    self.__add_to_contents(Lucene.FIELDNAME_ID, wiki_uri, Lucene.FIELDTYPE_ID)
                    if self.annot_only:
                        self.__add_to_contents(Lucene.FIELDNAME_CONTENTS, article_annots,
                                               Lucene.FIELDTYPE_ID_TV)
                    else:
                        self.__add_to_contents(Lucene.FIELDNAME_CONTENTS, article_text,
                                               Lucene.FIELDTYPE_TEXT_TVP)
                    self.lucene.add_document(self.contents)
                # starts collecting the next article from scratch
                self.contents = []
                article_text = ""
                article_annots = []

            # ------ Process other lines of article ---------
            tag_iter = list(self.tagRE.finditer(line))
            # adds line to content if there is no annotation
            if not tag_iter:
                article_text += line
                continue
            # A tag is detected in the line
            for t in tag_iter:
                tag = t.group(3)
                if tag == "doc":
                    doc_title = self.titleRE.search(t.group(2))
                    wiki_uri = WikipediaUtils.wiki_title_to_uri(doc_title.group(1)) if doc_title else None
                elif tag == "a":
                    # resolves annotations and replace them with mention
                    article_text += t.group(1) + t.group(4)
                    # extracts only annotations
                    if self.annot_only:
                        link_title = self.linkRE.search(t.group(2))
                        link_uri = WikipediaUtils.wiki_title_to_uri(
                            unquote(link_title.group(1))) if link_title else None
                        if link_uri is not None:
                            article_annots.append(link_uri)
                        else:
                            print("\nINFO: link to the annotation not found in " + file_name)
            # keeps the text that follows the last tag of the line
            article_text += line[tag_iter[-1].end():]
def __get_uri_from_title(self, title):
    """Converts a title to its Wikipedia URI, passing the NONE marker through.

    :param title: title to convert, or TagmeAPI.NONE
    :return: WikipediaUtils.wiki_title_to_uri(title), or TagmeAPI.NONE when
        the title equals TagmeAPI.NONE
    """
    if title != TagmeAPI.NONE:
        return WikipediaUtils.wiki_title_to_uri(title)
    return TagmeAPI.NONE
def __get_uri_from_title(self, title):
    """Converts a title to its Wikipedia URI, passing the NONE marker through.

    :param title: title to convert, or TagmeAPI.NONE
    :return: WikipediaUtils.wiki_title_to_uri(title), or TagmeAPI.NONE when
        the title equals TagmeAPI.NONE
    """
    if title != TagmeAPI.NONE:
        return WikipediaUtils.wiki_title_to_uri(title)
    return TagmeAPI.NONE
def index_file(self, file_name):
    """
    Adds one file to the index.

    The file is read line by line. A "doc" tag sets the Wikipedia URI of the
    current article, "a" tags are replaced by their surrounding/inner text
    (and, in annot-only mode, their link targets are collected). A line
    containing "</doc>" closes the article, which is then added to the
    index unless it has no title, is a disambiguation page, or is a
    "List of"/"Table of" page.

    :param file_name: file to be indexed
    """
    self.contents = []
    article_text = ""
    article_annots = []  # for annot-only index
    # Defined up front so that a "</doc>" seen before any "doc" tag is
    # reported as a null title instead of raising a NameError.
    wiki_uri = None

    # The context manager closes the file even if indexing raises.
    with open(file_name, "r") as f:
        for line in f:
            line = line.replace("#redirect", "")
            # ------ Reaches the end tag for an article ---------
            if "</doc>" in line:
                # ignores null titles
                if wiki_uri is None:
                    print("\tINFO: Null Wikipedia title!")
                # ignores disambiguation pages
                elif wiki_uri.endswith("(disambiguation)>") or \
                        ((len(article_text) < 200) and ("may refer to:" in article_text)):
                    print("\tINFO: disambiguation page " + wiki_uri + " ignored!")
                # ignores list pages
                elif wiki_uri.startswith(("<wikipedia:List_of", "<wikipedia:Table_of")):
                    print("\tINFO: List page " + wiki_uri + " ignored!")
                # adds the document to the index
                else:
                    self.__add_to_contents(Lucene.FIELDNAME_ID, wiki_uri, Lucene.FIELDTYPE_ID)
                    if self.annot_only:
                        self.__add_to_contents(Lucene.FIELDNAME_CONTENTS, article_annots,
                                               Lucene.FIELDTYPE_ID_TV)
                    else:
                        self.__add_to_contents(Lucene.FIELDNAME_CONTENTS, article_text,
                                               Lucene.FIELDTYPE_TEXT_TVP)
                    self.lucene.add_document(self.contents)
                # starts collecting the next article from scratch
                self.contents = []
                article_text = ""
                article_annots = []

            # ------ Process other lines of article ---------
            tag_iter = list(self.tagRE.finditer(line))
            # adds line to content if there is no annotation
            if not tag_iter:
                article_text += line
                continue
            # A tag is detected in the line
            for t in tag_iter:
                tag = t.group(3)
                if tag == "doc":
                    doc_title = self.titleRE.search(t.group(2))
                    wiki_uri = WikipediaUtils.wiki_title_to_uri(doc_title.group(1)) if doc_title else None
                elif tag == "a":
                    # resolves annotations and replace them with mention
                    article_text += t.group(1) + t.group(4)
                    # extracts only annotations
                    if self.annot_only:
                        link_title = self.linkRE.search(t.group(2))
                        link_uri = WikipediaUtils.wiki_title_to_uri(
                            unquote(link_title.group(1))) if link_title else None
                        if link_uri is not None:
                            article_annots.append(link_uri)
                        else:
                            print("\nINFO: link to the annotation not found in " + file_name)
            # keeps the text that follows the last tag of the line
            article_text += line[tag_iter[-1].end():]