Example #1
0
def listCategs(prefix):
    """Return all category names starting with *prefix*.

    Uses the MediaWiki `list=allcategories` API (500 per page) and follows
    query-continue pagination via the `acfrom` token until no continuation
    is returned.
    """
    prefix = '%20'.join(prefix.split(' '))
    url = BASEURL+'?action=query&format=xml&list=allcategories&acprefix='+prefix+'&aclimit=500'
    res = []

    txt = urlMonkey.getURL(url)
    res.extend(re.findall('preserve\">(.*?)</c>', txt))

    cont = re.search('query-continue><allcategories acfrom=\"(.*?)\"', txt)
    while cont is not None:
        txt = urlMonkey.getURL(url+'&acfrom='+'%20'.join(cont.group(1).split(' ')))
        res.extend(re.findall('preserve\">(.*?)</c>', txt))
        cont = re.search('query-continue><allcategories acfrom=\"(.*?)\"', txt)
    # BUGFIX: the original extended `res` with the final page's matches a
    # second time here, duplicating the last batch of results.
    return res
Example #2
0
def search(title, base=BASEURL):
    """Run a MediaWiki `list=search` query for *title*.

    Returns the API's spelling suggestions followed by the matching page
    titles, scraped from the XML response.
    """
    query = title.replace(' ', '_')
    request = base+'?action=query&format=xml&list=search&srsearch='+query
    body = urlMonkey.getURL(request)

    suggestions = re.findall('suggestion=\"(.+?)\"', body)
    hits = re.findall('title=\"(.+?)\"', body)
    return suggestions + hits
Example #3
0
def queryInterface(title, depth=0):
    url = ''.join(['http://en.wikipedia.org/w/index.php?title=Special:Search&search=',
                   '+'.join(cgi.escape(title).split(' '))])
    txt = urlMonkey.getURL(url)
    try:
            wtitles = [re.search('<title>(.+?) - Wikipedia', txt).group(1)]
            didyoumean = []
    except:
            return []
    
    if 'search results' in wtitles[0].lower():
        print 'hit search page'
        wtitles = re.findall('href=\"/wiki/(.+?)\"', txt)
        wtitles = filter(lambda x: ':' not in x and '#' not in x and x!='Main_Page', wtitles)
        
        didyoumean = re.search('Did you mean:.+?search=(.+?)[&\"]', txt)
        if didyoumean != None:
            print 'hit search suggestion'
            didyoumean = re.sub('\+', ' ', didyoumean.group(1))
            if depth < 2:
                didyoumean = queryInterface(didyoumean, depth+1)
            else:
                return wtitles+[didyoumean]
        else:
            didyoumean = []
    else:
        print 'hit direct match'
            
    return wtitles+didyoumean
Example #4
0
def getCategories(title, base=BASEURL):
    """Return the categories of *title* (up to 45), excluding maintenance
    categories whose names mention Redirects, All, or Articles."""
    title = reformat(title)
    request = base+'?action=query&format=xml&titles='+title+'&prop=categories&cllimit=45'
    body = urlMonkey.getURL(request)
    found = re.findall('title=\"Category:(.+?)\"', body)
    return [c for c in found
            if 'Redirects' not in c
            and 'All' not in c
            and 'Articles' not in c]
Example #5
0
def getLinkToArticle(title, base=BASEURL):
    """Best-effort resolution of *title* to its full article URL.

    Takes the top opensearch suggestion, then asks the `prop=info` API
    (with redirects followed) for the `fullurl` attribute.  Returns '#NA'
    on any failure: no suggestion (IndexError), no fullurl in the response
    (AttributeError), or a fetch error.
    """
    title = reformat(title)
    try:
        title = getSearchSuggestions(title)[0]
        url = base+'?action=query&titles='+title+'&format=xml&prop=info&inprop=url&redirects'
        txt = urlMonkey.getURL(url)
        return re.search('fullurl=\"(.+?)\"', txt).group(1)
    except Exception:
        # Deliberate best-effort fallback; narrowed from a bare except so
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return '#NA'
Example #6
0
def isPerson(title):
    """Heuristic person test: True if any category of the article contains
    'births', 'deaths', or 'person' (case-insensitive)."""
    #title = reformat(title)
    title = '_'.join(title.split(' '))
    url = BASEURL+'?action=query&format=xml&titles='+title+'&prop=categories&cllimit=45&redirects'
    txt = urlMonkey.getURL(url)

    cats = re.findall('title=\"Category:(.+?)\"', txt)
    # any() short-circuits on the first match; the original folded over
    # every category even after the answer was known.
    return any('births' in c or 'deaths' in c or 'person' in c
               for c in (x.lower() for x in cats))
Example #7
0
def getCatMembers(cat):
    """Return the titles of every member of category *cat*.

    Prepends 'Category:' if missing, then pages through the
    `list=categorymembers` API via the `cmcontinue` token until the
    response carries no continuation.
    """
    if 'Category:' not in cat:
        cat = 'Category:' + cat

    cont_pat = re.compile('cmcontinue=\"(.+?)\"')
    title_pat = re.compile('title=\"(.+?)\"')

    cat = re.sub(' ', '_', cat)
    base_url = (BASEURL+'?action=query&format=xml&list=categorymembers&cmtitle='+
                cat+'&cmlimit=max')

    page = urlMonkey.getURL(base_url)
    members = title_pat.findall(page)

    while True:
        token = cont_pat.search(page)
        if token is None:
            break
        page = urlMonkey.getURL(base_url+'&cmcontinue='+token.group(1))
        members += title_pat.findall(page)

    return members
Example #8
0
def getText(title, base=BASEURL, rfmt=None, shelve_only=False):
    #returns text with specified wiki titles
    title = re.sub(' ', '_', title)

    #check local copy first
    try:
            txt = SHM.getArticleText(title)
            if txt != None:
                    return txt
    except:
            pass

    if shelve_only:
        print 'miss', title
        return ''
    
    if rfmt != None:
        title = rfmt(title)
    url = base+'?action=query&titles='+title+'&format=xml&prop=revisions&rvprop=content&redirects'
    #print url in urlMonkey.UCH.cache_dict
    
    txt = urlMonkey.getURL(url)
    return txt
Example #9
0
def bing_getURL(url):
    # Thin wrapper: fetch *url* through urlMonkey with wiki-specific
    # handling disabled.
    return urlMonkey.getURL(url, WIKI=False)
    # NOTE(review): stray unterminated triple-quote below — it appears to
    # open a string block that continues past this chunk (likely
    # commenting out code in the original file); left byte-identical.
    '''
Example #10
0
def getInfo(title):
    """Return the pageid attribute values reported by the `prop=info` API
    (redirects followed) for *title*."""
    formatted = reformat(title)
    request = BASEURL+'?action=query&format=xml&titles='+formatted+'&prop=info&redirects'
    response = urlMonkey.getURL(request)
    return re.findall('pageid=\"(.+?)\"', response)
Example #11
0
def getSearchSuggestions(title, base=BASEURL, limit=20):
    """Return up to *limit* opensearch suggestions for *title*, each passed
    through reformat().

    BUGFIX: the opensearch API's result-count parameter is `limit`; the
    original sent `llimit`, which the API silently ignores, so requests
    always fell back to the server default and the *limit* argument had
    no effect.
    """
    title = reformat(title)
    url = base+'?action=opensearch&search='+title+'&format=xml&limit='+str(limit)
    txt = urlMonkey.getURL(url)
    return [reformat(x) for x in re.findall('<Text.*?>(.*?)</Text>', txt)]