import cgi
import re


def listCategs(prefix):
    """List all category names starting with prefix via list=allcategories."""
    prefix = '%20'.join(prefix.split(' '))
    url = (BASEURL + '?action=query&format=xml&list=allcategories&acprefix=' +
           prefix + '&aclimit=500')
    res = []
    txt = urlMonkey.getURL(url)
    res.extend(re.findall('preserve\">(.*?)</c>', txt))
    cont = re.search('query-continue><allcategories acfrom=\"(.*?)\"', txt)
    # Follow the query-continue token until the API stops returning one.
    # (The original had a second, redundant res.extend here that double-counted
    # the final page; results are now extended exactly once per fetch.)
    while cont != None:
        txt = urlMonkey.getURL(url + '&acfrom=' + '%20'.join(cont.group(1).split(' ')))
        res.extend(re.findall('preserve\">(.*?)</c>', txt))
        cont = re.search('query-continue><allcategories acfrom=\"(.*?)\"', txt)
    return res

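# Usage sketch, not part of the original module (the prefix below is purely
# illustrative, and BASEURL is assumed to point at a MediaWiki api.php
# endpoint). The query-continue loop above means this collects every matching
# category, not just the first 500:
#
#   cats = listCategs('Presidents of')
#   print len(cats), cats[:5]
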
def search(title, base=BASEURL):
    title = re.sub(' ', '_', title)
    url = base + '?action=query&format=xml&list=search&srsearch=' + title
    txt = urlMonkey.getURL(url)
    #print txt
    return (re.findall('suggestion=\"(.+?)\"', txt) +
            re.findall('title=\"(.+?)\"', txt))

def queryInterface(title, depth=0):
    """Resolve a title through Wikipedia's Special:Search interface.

    Tries a direct hit first; on a search-results page it collects article
    links and follows 'Did you mean' suggestions at most two levels deep.
    """
    url = ''.join(['http://en.wikipedia.org/w/index.php?title=Special:Search&search=',
                   '+'.join(cgi.escape(title).split(' '))])
    txt = urlMonkey.getURL(url)
    try:
        wtitles = [re.search('<title>(.+?) - Wikipedia', txt).group(1)]
        didyoumean = []
    except:
        return []
    if 'search results' in wtitles[0].lower():
        print 'hit search page'
        wtitles = re.findall('href=\"/wiki/(.+?)\"', txt)
        # Keep only article links: no namespaces, no anchors, no Main Page.
        wtitles = filter(lambda x: ':' not in x and '#' not in x and x != 'Main_Page',
                         wtitles)
        didyoumean = re.search('Did you mean:.+?search=(.+?)[&\"]', txt)
        if didyoumean != None:
            print 'hit search suggestion'
            didyoumean = re.sub('\+', ' ', didyoumean.group(1))
            if depth < 2:
                didyoumean = queryInterface(didyoumean, depth + 1)
            else:
                return wtitles + [didyoumean]
        else:
            didyoumean = []
    else:
        print 'hit direct match'
    return wtitles + didyoumean

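# Behavior sketch for queryInterface (hypothetical titles, assuming a live
# connection to en.wikipedia.org): an exact title returns a one-element list;
# a misspelling hits the results page, harvests article links, and recurses on
# 'Did you mean' suggestions until the depth cap is reached.
#
#   print queryInterface('Albert Einstein')   # 'hit direct match'
#   print queryInterface('Albert Einstien')   # 'hit search page' + suggestion
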
def getCategories(title, base=BASEURL):
    title = reformat(title)
    url = base + '?action=query&format=xml&titles=' + title + '&prop=categories&cllimit=45'
    txt = urlMonkey.getURL(url)
    # Drop maintenance categories (redirect trackers, "All ...", "Articles ...").
    return filter(lambda x: 'Redirects' not in x and 'All' not in x and 'Articles' not in x,
                  re.findall('title=\"Category:(.+?)\"', txt))

def getLinkToArticle(title, base=BASEURL):
    title = reformat(title)
    try:
        # Use the top search suggestion as the canonical title.
        title = getSearchSuggestions(title)[0]
        url = base + '?action=query&titles=' + title + '&format=xml&prop=info&inprop=url&redirects'
        txt = urlMonkey.getURL(url)
        return re.search('fullurl=\"(.+?)\"', txt).group(1)
    except:
        return '#NA'

def isPerson(title):
    """Heuristic: treat a page as a person if any category mentions births, deaths, or person."""
    #title = reformat(title)
    title = '_'.join(title.split(' '))
    url = (BASEURL + '?action=query&format=xml&titles=' + title +
           '&prop=categories&cllimit=45&redirects')
    txt = urlMonkey.getURL(url)
    cats = re.findall('title=\"Category:(.+?)\"', txt)
    res = False
    for c in map(lambda x: x.lower(), cats):
        res = (res or 'births' in c or 'deaths' in c or 'person' in c)
    return res

def getCatMembers(cat):
    """Return the titles of every member of a category, following cmcontinue."""
    if 'Category:' not in cat:
        cat = 'Category:' + cat
    cont = re.compile('cmcontinue=\"(.+?)\"')
    titles = re.compile('title=\"(.+?)\"')
    cat = re.sub(' ', '_', cat)
    url = (BASEURL + '?action=query&format=xml&list=categorymembers&cmtitle=' +
           cat + '&cmlimit=max')
    txt = urlMonkey.getURL(url)
    furl = url
    res = titles.findall(txt)
    m = cont.search(txt)
    # Page through results until the API stops sending a continuation token.
    while m != None:
        url = furl + '&cmcontinue=' + m.group(1)
        txt = urlMonkey.getURL(url)
        res += titles.findall(txt)
        m = cont.search(txt)
    return res

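# Usage sketch (hypothetical category name; assumes BASEURL and urlMonkey are
# configured as above). The cmcontinue loop pages through the full member
# list, so large categories come back complete rather than capped at one page:
#
#   members = getCatMembers('American physicists')
#   print len(members)
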
def getText(title, base=BASEURL, rfmt=None, shelve_only=False):
    """Return text for the specified wiki title, checking the local shelf first."""
    title = re.sub(' ', '_', title)
    # Check local copy first.
    try:
        txt = SHM.getArticleText(title)
        if txt != None:
            return txt
    except:
        pass
    if shelve_only:
        print 'miss', title
        return ''
    if rfmt != None:
        title = rfmt(title)
    url = base + '?action=query&titles=' + title + '&format=xml&prop=revisions&rvprop=content&redirects'
    #print url in urlMonkey.UCH.cache_dict
    txt = urlMonkey.getURL(url)
    return txt

def bing_getURL(url):
    # Fetch a non-wiki URL through the same caching layer.
    return urlMonkey.getURL(url, WIKI=False)

def getInfo(title):
    title = reformat(title)
    url = BASEURL + '?action=query&format=xml&titles=' + title + '&prop=info&redirects'
    txt = urlMonkey.getURL(url)
    return re.findall('pageid=\"(.+?)\"', txt)

def getSearchSuggestions(title, base=BASEURL, limit=20):
    title = reformat(title)
    url = base + '?action=opensearch&search=' + title + '&format=xml&limit=' + str(limit)
    txt = urlMonkey.getURL(url)
    return map(lambda x: reformat(x), re.findall('<Text.*?>(.*?)</Text>', txt))
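
# Minimal smoke test, a sketch rather than part of the original module: it
# assumes BASEURL names a live MediaWiki api.php endpoint and that the
# urlMonkey, SHM, and reformat helpers are initialized elsewhere in this file.
# The article title is only an illustration.
if __name__ == '__main__':
    t = 'Albert Einstein'
    print getSearchSuggestions(t)[:3]
    print getCategories(t)[:3]
    print isPerson(t)
    print getLinkToArticle(t)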