def get_page_id(v):
    """
    Return the numeric Wikipedia page id of the article titled 'v'.

    Sends an action=query request (titles=<v>, format=json) to w/api.php
    on WPDOMAIN and returns the first key of the "pages" object of the
    answer, converted to int.

    Example request and answer:
      http://it.wikipedia.org/w/api.php?action=query&titles=Abbazia_di_San_Galgano&format=json
      {"query":{"normalized":[{"from":"Abbazia_di_San_Galgano","to":"Abbazia di San Galgano"}],
       "pages":{"83117":{"pageid":83117,"ns":0,"title":"Abbazia di San Galgano"}}}}

    Makes up to MAXTRIES attempts, waiting 5 seconds between them. Any
    failure (network error, invalid JSON, unexpected answer layout) is
    printed and retried; returns None when every attempt failed.

    NOTE(review): for a title that does not exist the API seems to answer
    with a negative placeholder key (e.g. "-1"); that value is returned
    as is - confirm callers cope with it.
    """
    queryurl = UrlBuilder(domain=WPDOMAIN,path="w/api.php",params="action=query")
    queryurl.set_attr('titles',v)
    queryurl.set_attr('format','json')
    query = queryurl.build()

    pageid = None
    # Attempts are numbered 1..MAXTRIES, so exactly MAXTRIES requests are
    # made (the previous range(1,MAXTRIES) stopped one short).
    for ntry in range(1, MAXTRIES + 1):
        print("Request no. %d - Requesting %s" % (ntry, query))
        try:
            # Opening the URL is inside the try block so that a transient
            # network error triggers a retry instead of aborting.
            jsonpage = urllib2.urlopen(query)
            try:
                jobj = json.load(jsonpage)
            finally:
                # Always release the HTTP connection.
                jsonpage.close()
            pageid = int(list(jobj['query']['pages'])[0])
            break
        except Exception as e:
            # Deliberately best-effort: report the problem and try again.
            print(e)
            pageid = None
            # No point in waiting after the last attempt.
            if ntry < MAXTRIES:
                time.sleep(5)
    return pageid
# Rewrite every row of 'inlist' in place: the first cell becomes
# '[[page|page name]]' markup and the second cell '[url id]' markup
# pointing at the URL built from OSMURL.
# NOTE(review): the indentation of this fragment was reconstructed; confirm
# that every statement below (in particular 'txt') belongs to the loop body.
for row in inlist:
    # Encode every cell of the row as UTF-8, keeping the same list object.
    row[:] = [r.encode('utf-8') for r in row]

    # Positional layout of a row, as read here:
    # 0 = wiki page title, 1 = OSM id, 2 = OSM type, 3 = longitude, 4 = latitude
    wikipage = row[0]
    osm_id = row[1]
    osm_type = row[2]
    osm_lon = row[3]
    osm_lat = row[4]

    # Map the OSM type to the name used as URL parameter
    # (raises KeyError for a type missing from ELEMENTS).
    osm_element = ELEMENTS[osm_type]

    # URL of the form <OSMURL>?<osm_element>=<osm_id> with mlon/mlat
    # attributes; the exact layout is decided by UrlBuilder.build().
    osmUrl = UrlBuilder(domain=OSMURL,
                        path='',
                        params='{osm_element}={osm_id}'.format(
                            osm_element=osm_element,
                            osm_id=osm_id),
                        attrs={'mlon': osm_lon,
                               'mlat': osm_lat
                               }
                        )
    osmurl = osmUrl.build()

    # Shows the encoded row before its first two cells are replaced.
    print row

    # Link to the page, displayed with underscores turned into spaces.
    row[0] = '[[{wikipage}|{pagename}]]'.format(
        wikipage=wikipage,
        pagename=wikipage.replace('_',' '))
    # Link to the OSM object, displayed as its id.
    row[1] = '[{osmurl} {osm_id}]'.format(osmurl=osmurl, osm_id=osm_id)

    # Start of the output text for this row; how it is extended is not
    # visible in this fragment.
    txt = '|'
u'Architetto', u'StileArchitett', u'InizioCostr', u'FineCostr', u'Demolizione', u'Sito', u'lat', u'long' ] """ Utility functions """ _jsonu = UrlBuilder( domain="json.it.dbpedia.org", path="annotate/resource/json/it%3A{wp-page}", params="filter=__type:template" ) _jsonu.set_attr('flags','-Extractors,Structure,') _jsonbaseurl=_jsonu.build() def get_jsonpedia_page(v): """ Gets the corrisponding JSONpedia page (only templates) for Wikipedia article titled 'v'. Tries MAXTRIES times or returns none. """ vsafe = v.replace(' ','_') jsonurl = _jsonbaseurl.replace('{wp-page}',urllib.quote(vsafe)) for ntry in range(1,MAXTRIES): try:
def query_api():
    """
    Return the titles of the pages that embed the page named WPTNAME.

    Queries w/api.php on WPDOMAIN with generator=embeddedin, restricted
    to namespace 0, 500 results per request, XML output. The "title"
    attribute of every <page> element of each answer is collected.

    As long as an answer contains an <embeddedin> element, its
    "geicontinue" attribute is sent with the next request, after a
    5 second pause; the loop ends with the first answer without one.

    NOTE(review): this relies on the answer carrying the continuation
    value in an <embeddedin> element; newer MediaWiki versions may report
    it in a <continue> element instead, which would end the loop after
    the first batch - confirm against the targeted wiki.

    Errors raised by urllib2.urlopen or parseString are not caught.
    """
    queryurl = UrlBuilder(domain=WPDOMAIN,path="w/api.php",params="action=query")
    queryurl.set_attr('generator','embeddedin')
    queryurl.set_attr('geititle',WPTNAME)
    # Parameters of a module used as generator carry the 'g' prefix, like
    # 'geititle' and 'geilimit' here; the un-prefixed 'einamespace' used
    # previously was not applied to the generator.
    queryurl.set_attr('geinamespace','0')
    queryurl.set_attr('geilimit','500')
    queryurl.set_attr('format','xml')

    inlist = []
    while True:
        # Build the URL once per round trip and reuse it for log and request.
        url = queryurl.build()
        print("Requesting %s" % url)
        infile = urllib2.urlopen(url)
        try:
            inxml = infile.read()
        finally:
            # Always release the HTTP connection.
            infile.close()

        xml = parseString(inxml)
        for page in xml.getElementsByTagName("page"):
            inlist.append(page.getAttribute("title"))

        querycont = xml.getElementsByTagName("embeddedin")
        if not querycont:
            # No continuation information: this was the last batch.
            break
        geicontinue = querycont[0].getAttribute("geicontinue")
        queryurl.set_attr("geicontinue",geicontinue)
        # Pause between consecutive requests.
        time.sleep(5)
    return inlist
# You should have received a copy of the GNU General Public License # along with this program (see COPYING). # If not, see <http://www.gnu.org/licenses/>. ######################################################################### import logging from wocmod.wocurlbuilder import UrlBuilder from wocmod.wocjson import JSONQuerier from wocmod.wocdb import MySQLConnector,PostgreSQLConnector from wocmod.wocglobal import WOC logger = logging.getLogger('woc.woccoords') _jsonu = UrlBuilder( domain="json.it.dbpedia.org", path="annotate/resource/json/it%3A{wppage}", params="filter=__type:template" ) _jsonu.set_attr('flags','-Extractors,Structure,') JSONPEDIABASEURL=_jsonu.build() class CoordinateGetter(object): def __init__(self,item): self.item = item self.coords = None def _from_db(self): pass