def __init__(self, row, isUtf8=False):
    """Wrap one database row and work out whether it is a redirect.

    row    -- mutable sequence indexed by the CUR_* column constants.  It is
              kept on the instance as self.row and modified in place.
    isUtf8 -- when true, title and text are run through utf8ToLatin1 first
              and the numbered entities in the title are converted.

    Afterwards self.redirect is None for an ordinary article (and
    row[CUR_TEXT] holds the stripped text), or the redirect target with
    spaces turned into underscores (row[CUR_TEXT] is left untouched).
    """
    self.row = row
    assert not fInvalidRedirect(row)
    self.md5Hash = None

    body = row[CUR_TEXT]
    if isUtf8:
        latinTitle = utf8ToLatin1(self.row[CUR_TITLE])
        self.row[CUR_TITLE] = entities.convertNumberedEntities(
            latinTitle, latinTitle)
        body = utf8ToLatin1(body)

    # We pretty much ignore the CUR_IS_REDIRECT setting:
    #  - rows marked as redirect that are not redirects are thrown out
    #    before coming here;
    #  - rows not marked as redirect whose text looks like one are treated
    #    as redirects anyway.
    self.redirect = None
    target = getRedirectFromText(body)
    if not target:
        # Ordinary article: store the (possibly converted) text back.
        self.row[CUR_TEXT] = body.strip()
        return

    self.redirect = entities.convertNumberedEntities(
        self.row[CUR_TITLE], target.replace(" ", "_"))
    if int(row[CUR_IS_REDIRECT]) == 0:
        # redirect not marked as such
        print("%s is a redirect but not marked as such" % self.getTitle())
def __init__(self, row, isUtf8=False):
    """Build the article object from a row of the source table.

    row    -- sequence addressed through the CUR_* index constants; stored
              as self.row and updated in place (title when isUtf8 is set,
              text when the row is not a redirect).
    isUtf8 -- if true, convert title and text with utf8ToLatin1 and convert
              numbered entities in the title.

    Sets self.md5Hash to None and self.redirect to the redirect target
    found in the text, or None when the text is not a redirect.
    """
    self.row = row
    assert not fInvalidRedirect(row)
    self.md5Hash = None

    articleText = row[CUR_TEXT]
    if isUtf8:
        convertedTitle = utf8ToLatin1(self.row[CUR_TITLE])
        self.row[CUR_TITLE] = entities.convertNumberedEntities(
            convertedTitle, convertedTitle)
        articleText = utf8ToLatin1(articleText)

    self.redirect = None
    # The CUR_IS_REDIRECT column is mostly ignored.  Rows flagged as
    # redirects that are not redirects get thrown out before we are
    # called; rows that look like redirects are treated as redirects even
    # when the flag is not set.
    redirectTarget = getRedirectFromText(articleText)
    if redirectTarget:
        underscored = redirectTarget.replace(" ", "_")
        self.redirect = entities.convertNumberedEntities(
            self.row[CUR_TITLE], underscored)
        flaggedAsRedirect = int(row[CUR_IS_REDIRECT]) != 0
        if not flaggedAsRedirect:
            # redirect not marked as such
            print("%s is a redirect but not marked as such" % self.getTitle())
    else:
        self.row[CUR_TEXT] = articleText.strip()
def parse411ReversePhoneLookup(htmlTxt):
    """Pull the single listing out of a 411 reverse-phone-lookup page.

    htmlTxt -- the page HTML as a string.

    Returns a (status, payload) tuple:
      (NO_RESULTS, None)     -- the page says "We did not find a listing"
      (UNKNOWN_FORMAT, None) -- no div with id "subtext", or no name in it
      (RESULTS_DATA, data)   -- data is universalDataFormatReplaceEntities()
                                of one [name, address, city, phone] row
    """
    if htmlTxt.find("We did not find a listing") != -1:
        print("parse411ReversePhoneLookup: no results")
        return (NO_RESULTS, None)

    soup = BeautifulSoup21.BeautifulSoup(htmlTxt)
    subtext = soup.first("div", {"id": "subtext"})
    if not subtext:
        print("parse411ReversePhoneLookup: no div")
        return (UNKNOWN_FORMAT, None)

    rawName = subtext.span.strong.string
    if not rawName:
        print("parse411ReversePhoneLookup: no name")
        return (UNKNOWN_FORMAT, None)
    name = cleanupName(convertNamedEntities(convertNumberedEntities(rawName)))

    # The address node sits six .next hops past the div; city and phone
    # each follow two hops after the previous node.
    addrNode = subtext
    for _hop in range(6):
        addrNode = addrNode.next
    cityNode = addrNode.next.next
    phoneNode = cityNode.next.next

    row = [name, addrNode.string, cityNode.string, phoneNode.string]
    return (RESULTS_DATA, universalDataFormatReplaceEntities([row]))
def personSearch2(htmlTxt):
    """Parse a person-search result page and print the fields it finds.

    htmlTxt -- the page HTML as a string.

    Returns (res, None) when testNoResults() reports anything other than
    RESULTS_DATA, and (UNKNOWN_FORMAT, None) otherwise.

    NOTE(review): even when name, street, city and phone are all extracted
    they are only printed and the function still returns
    (UNKNOWN_FORMAT, None).  This looks unfinished -- confirm with the
    callers before changing the return value.
    """
    soup = BeautifulSoup21.BeautifulSoup(htmlTxt)
    res = testNoResults(soup)
    if RESULTS_DATA != res:
        return (res, None)

    divTag = soup.first("div", {"id": "subtext"})
    if not divTag:
        print("No divTag")
        return (UNKNOWN_FORMAT, None)

    # The name is inside <span><strong>; the street is the span's next
    # sibling, city and phone each come two siblings later.
    spanTag = divTag.span
    strongTag = spanTag.strong
    nameStr = convertNumberedEntities(strongTag.string.strip())
    print(nameStr)

    addrStreetTag = spanTag.nextSibling
    addrStreet = addrStreetTag.string.strip()
    print(addrStreet)

    addrCityTag = addrStreetTag.nextSibling.nextSibling
    addrCity = addrCityTag.string.strip()
    print(addrCity)

    addrPhoneTag = addrCityTag.nextSibling.nextSibling
    addrPhone = addrPhoneTag.string.strip()
    print(addrPhone)

    return (UNKNOWN_FORMAT, None)
def parse411ReversePhoneLookup(htmlTxt):
    """Parse the HTML of a 411 reverse phone lookup into one result row.

    htmlTxt -- page HTML as a string.

    The result is a (status, payload) pair.  payload is None unless status
    is RESULTS_DATA, in which case it is the output of
    universalDataFormatReplaceEntities() for a single
    [name, address, city, phone] row.  NO_RESULTS is returned when the page
    contains "We did not find a listing"; UNKNOWN_FORMAT when the div with
    id "subtext" or the name inside it cannot be found.
    """
    if htmlTxt.find("We did not find a listing") >= 0:
        print("parse411ReversePhoneLookup: no results")
        return (NO_RESULTS, None)

    page = BeautifulSoup21.BeautifulSoup(htmlTxt)
    listingDiv = page.first("div", {"id": "subtext"})
    if not listingDiv:
        print("parse411ReversePhoneLookup: no div")
        return (UNKNOWN_FORMAT, None)

    personName = listingDiv.span.strong.string
    if not personName:
        print("parse411ReversePhoneLookup: no name")
        return (UNKNOWN_FORMAT, None)
    # Entities are resolved before the name is cleaned up.
    for convert in (convertNumberedEntities, convertNamedEntities, cleanupName):
        personName = convert(personName)

    # Fixed offsets from the div: street, then city, then phone.
    streetNode = listingDiv.next.next.next.next.next.next
    cityNode = streetNode.next.next
    phoneNode = cityNode.next.next

    fields = [personName]
    fields += [node.string for node in (streetNode, cityNode, phoneNode)]
    return (RESULTS_DATA, universalDataFormatReplaceEntities([fields]))
def personSearch2(htmlTxt):
    """Extract and print name, street, city and phone from a search page.

    htmlTxt -- the page HTML as a string.

    Returns (res, None) if testNoResults() yields anything but RESULTS_DATA;
    in every other case returns (UNKNOWN_FORMAT, None).

    NOTE(review): the extracted values are printed but never returned, so
    the success path also ends in (UNKNOWN_FORMAT, None).  Presumably work
    in progress -- verify before relying on the return value.
    """
    soup = BeautifulSoup21.BeautifulSoup(htmlTxt)
    res = testNoResults(soup)
    if RESULTS_DATA != res:
        return (res, None)

    container = soup.first("div", {"id": "subtext"})
    if not container:
        print("No divTag")
        return (UNKNOWN_FORMAT, None)

    # Name: text of <strong> inside the first <span> of the container.
    nameSpan = container.span
    print(convertNumberedEntities(nameSpan.strong.string.strip()))

    # Street is the span's next sibling; city and phone are each two
    # siblings further on.
    streetTag = nameSpan.nextSibling
    print(streetTag.string.strip())

    cityTag = streetTag.nextSibling.nextSibling
    print(cityTag.string.strip())

    phoneTag = cityTag.nextSibling.nextSibling
    print(phoneTag.string.strip())

    return (UNKNOWN_FORMAT, None)
def universalDataFormatWithDefinition(definition, listOfLists):
    """Serialize a definition followed by rows of strings.

    definition  -- object whose serialize() method returns a string.
    listOfLists -- list of rows, each row an iterable of strings.  Must be
                   a list (checked with assert).

    The returned string has two parts separated by a newline:
      * header lines: the row count plus one, then "1 <length of the
        serialized definition>", then for every row the space-separated
        lengths of its items;
      * payload: "D", the serialized definition and every item, joined by
        single spaces and followed by one trailing space.
    Numbered and named entities in each item are converted before its
    length is measured.
    """
    assert isinstance(listOfLists, list)
    serializedDefinition = definition.serialize()
    header = [
        "%d" % (len(listOfLists) + 1),
        "1 %d" % len(serializedDefinition),
    ]
    results = ["D", serializedDefinition]
    for smallList in listOfLists:
        cleaned = [convertNamedEntities(convertNumberedEntities(item))
                   for item in smallList]
        results.extend(cleaned)
        header.append(" ".join(["%d" % len(item) for item in cleaned]))
    # str.join replaces the deprecated string.join module function.
    return "%s\n%s " % ("\n".join(header), " ".join(results))
def universalDataFormatReplaceEntities(listOfLists):
    """Serialize rows of strings, converting entities in every item.

    listOfLists -- list of rows, each row an iterable of strings.  Must be
                   a list (checked with assert).

    The returned string has two parts separated by a newline:
      * header lines: the number of rows, then for every row the
        space-separated lengths of its items;
      * payload: every item joined by single spaces, followed by one
        trailing space.
    Item lengths are measured after numbered and named entities have been
    converted.
    """
    assert isinstance(listOfLists, list)
    header = ["%d" % len(listOfLists)]
    results = []
    for smallList in listOfLists:
        # remove entities, numbered first and then named
        cleaned = [convertNamedEntities(convertNumberedEntities(item))
                   for item in smallList]
        results.extend(cleaned)
        header.append(" ".join(["%d" % len(item) for item in cleaned]))
    # str.join replaces the deprecated string.join module function.
    return "%s\n%s " % ("\n".join(header), " ".join(results))
def universalDataFormatReplaceEntities(listOfLists):
    """Turn a list of string rows into the universal data format string.

    listOfLists -- list (asserted) of rows; every row is an iterable of
                   strings.

    Output layout: the header, a newline, the payload and a trailing space.
    The header has one line with the row count followed by one line per
    row holding the space-separated lengths of that row's items.  The
    payload is all items of all rows joined by single spaces.  Numbered and
    named entities are converted in each item before its length is taken.
    """
    assert isinstance(listOfLists, list)
    headerLines = ["%d" % len(listOfLists)]
    payload = []
    for smallList in listOfLists:
        lengths = []
        for item in smallList:
            # remove entities
            item = convertNumberedEntities(item)
            item = convertNamedEntities(item)
            # add it to lists
            payload.append(item)
            lengths.append("%d" % len(item))
        headerLines.append(" ".join(lengths))
    # str.join is used instead of the deprecated string.join function.
    return "%s\n%s " % ("\n".join(headerLines), " ".join(payload))
def convertArticle(term, text):
    """Convert the wiki markup of one article into the simplified text form.

    term -- article title; handed to the entity converters and shown in the
            error message.
    text -- raw article text.

    Returns the converted text, stripped and terminated by a single
    newline.  If any step raises an Exception, the term and the exception
    are printed and '' is returned instead.
    """
    try:
        text = text.replace('__NOTOC__', '')
        text = fixSup2(text)
        text = removeImageRx(text)
        # remove categories. TODO: provide a better support for categories
        # i.e. we remember categories on the server and client can display
        # all articles in a given category
        # text=replaceRegExp(text, categoryRe, '')
        text = replaceWikiMacros(text)
        # remove remaining templates. TODO: better support for templates
        # in wikipedia template text is replaced by a page from Template:
        # namespace
        text = replaceRegExp(text, wikiTemplateRe, '')
        text = text.replace('\r', '')
        # This should be safe, as it's illegal in html to nest comments
        text = replaceRegExp(text, commentRe, '')
        text = stripTagBlocks(text, 'div')
        text = stripTagBlocks(text, 'table')
        text = stripBlocks(text, r'\{\|', r'\|\}')
        text = replaceRegExp(text, scriptRe, '')
        # Map the html tags we understand onto wiki markup ...
        text = replaceTagList(text, ['b', 'strong'], "'''")
        text = replaceTagList(text, ['em', 'i', 'cite'], "''")
        text = replaceTagList(text, ['hr'], '----')
        text = replaceTagList(text, ['p'], '<br>')
        # ... and drop the remaining ones.
        text = replaceTagList(text, [
            'dfn', 'code', 'samp', 'kbd', 'var', 'abbr', 'acronym',
            'blockquote', 'q', 'pre', 'ins', 'del', 'dir', 'menu', 'img',
            'object', 'big', 'span', 'applet', 'font', 'basefont', 'tr',
            'td', 'table', 'center', 'div'
        ], '')
        text = replaceRegExp(text, badLinkRe, '', supportedLanguagesRe())
        text = entities.convertNamedEntities(term, text)
        text = entities.convertNumberedEntities(term, text)
        text = stripMultipleNewLines(text)
        text = text.strip()
        text += '\n'
        return text
    except Exception as ex:
        # Two spaces after the colon keep the output identical to the old
        # two-argument print statement.
        print("Exception while converting term:  %s" % (term,))
        print(arsutils.exceptionAsStr(ex))
        return ''
def convertArticle(term, text):
    """Strip and translate wiki/html markup of an article body.

    term -- title of the article; passed to the entity converters and
            printed if the conversion fails.
    text -- the raw article text.

    Returns the cleaned text with surrounding whitespace removed and one
    newline appended, or "" when any conversion step raises an Exception
    (the term and the exception are printed in that case).
    """
    # Tags that are removed without a replacement.
    droppedTags = [
        "dfn", "code", "samp", "kbd", "var", "abbr", "acronym",
        "blockquote", "q", "pre", "ins", "del", "dir", "menu", "img",
        "object", "big", "span", "applet", "font", "basefont", "tr", "td",
        "table", "center", "div",
    ]
    # (tags, replacement) pairs, applied in this order.
    tagReplacements = (
        (["b", "strong"], "'''"),
        (["em", "i", "cite"], "''"),
        (["hr"], "----"),
        (["p"], "<br>"),
        (droppedTags, ""),
    )
    try:
        text = text.replace("__NOTOC__", "")
        text = fixSup2(text)
        text = removeImageRx(text)
        # remove categories. TODO: provide a better support for categories
        # i.e. we remember categories on the server and client can display
        # all articles in a given category
        # text=replaceRegExp(text, categoryRe, '')
        text = replaceWikiMacros(text)
        # remove remaining templates. TODO: better support for templates
        # in wikipedia template text is replaced by a page from Template:
        # namespace
        text = replaceRegExp(text, wikiTemplateRe, "")
        text = text.replace("\r", "")
        # This should be safe, as it's illegal in html to nest comments
        text = replaceRegExp(text, commentRe, "")
        for blockTag in ("div", "table"):
            text = stripTagBlocks(text, blockTag)
        text = stripBlocks(text, r"\{\|", r"\|\}")
        text = replaceRegExp(text, scriptRe, "")
        for tags, replacement in tagReplacements:
            text = replaceTagList(text, tags, replacement)
        text = replaceRegExp(text, badLinkRe, "", supportedLanguagesRe())
        text = entities.convertNamedEntities(term, text)
        text = entities.convertNumberedEntities(term, text)
        text = stripMultipleNewLines(text)
        return text.strip() + "\n"
    except Exception as ex:
        # The double space reproduces the output of the former
        # `print "...: ", term` statement.
        print("Exception while converting term:  %s" % (term,))
        print(arsutils.exceptionAsStr(ex))
        return ""