def get_locations_fromURL(self, url):
    """
    Parse a URL to extract its URL-DOMAIN country and URL-SUBJECT,
    e.g. 'taiwan' in 'cnn.com/taiwan/protest.html'.

    Params:
        url - a web URL

    Returns:
        Dict of locations obtained from the URL
    """
    results = {}
    urlinfo = urlparse(url)
    if urlinfo.netloc != "":
        # guard against URLs with an empty path, e.g. 'http://cnn.com'
        path_parts = urlinfo.path.split("/", 2)
        urlsubject = path_parts[1] if len(path_parts) > 1 else ""
        urlcountry = urlinfo.netloc.rsplit(".", 1)[-1]
        # resolve the URL-DOMAIN country from a 2-letter ISO code (ccTLD)
        if len(urlcountry.strip()) == 2:
            urlcountry = self.gazetteer.get_country(urlcountry.upper())
            if urlcountry != []:
                urlcountry = urlcountry[0]
                urlcountry.confidence = 1.0
                key = "URL-DOMAIN_{}".format(urlcountry)
                results[key] = LocationDistribution(urlcountry)
                results[key].frequency = 1
        if 5 < len(urlsubject) < 20:
            usubj_q = self.gazetteer.query(urlsubject, 15000)
            if usubj_q:
                key = "URL-SUBJECT_{}".format(urlsubject)
                results[key] = LocationDistribution(usubj_q)
                results[key].frequency = 1
    return results
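# Hedged usage sketch (not from the original source): shows how urlparse()
# decomposes a URL into the pieces get_locations_fromURL() inspects. Note
# that urlparse() only populates `netloc` when the URL includes a scheme,
# so a bare 'cnn.com/taiwan/protest.html' would fail the netloc check.
def _demo_url_decomposition():
    from urllib.parse import urlparse

    info = urlparse("http://cnn.com/taiwan/protest.html")
    print(info.netloc)                     # 'cnn.com'
    print(info.path.split("/", 2)[1])      # 'taiwan' -> URL-SUBJECT candidate
    print(info.netloc.rsplit(".", 1)[-1])  # 'com'    -> not a 2-letter ccTLD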
def fuzzyquery(self, locmap, countryFilter=None):
    # Re-query ambiguous names (more than one candidate realization)
    # with fuzzy matching, optionally restricted to a set of countries.
    if countryFilter is None:
        countryFilter = []
    for loc in locmap:
        if len(locmap[loc].realizations) != 1:
            freq = locmap[loc].frequency
            subres = self.gazetteer.query(loc, countryCode=countryFilter,
                                          fuzzy='AUTO')
            if subres != []:
                locmap[loc] = LocationDistribution(subres)
                locmap[loc].frequency = freq
    return locmap
def fuzzyquery(self, locmap, countryFilter=None):
    # Variant that merges the fuzzy matches with the existing candidate
    # points and recomputes their confidences before rebuilding the
    # distribution.
    if countryFilter is None:
        countryFilter = []
    for loc in locmap:
        if len(locmap[loc].realizations) != 1:
            freq = locmap[loc].frequency
            subres = self.gazetteer.query(loc, countryCode=countryFilter,
                                          fuzzy='AUTO')
            if subres != []:
                pts = subres + list(locmap[loc].realizations.values())
                ldist = self.gazetteer._get_loc_confidence(pts, self.min_popln)
                locmap[loc] = LocationDistribution(ldist)
                locmap[loc].frequency = freq
    return locmap
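# Hedged sketch (StubDistribution and the sample points below are invented
# stand-ins, not the project's real classes): illustrates the fuzzyquery()
# contract -- only entries with more than one candidate realization get
# fuzzily re-queried, and their original frequency is carried over.
def _demo_fuzzyquery_contract():
    class StubDistribution:
        def __init__(self, realizations):
            self.realizations = realizations  # candidate name -> geo point
            self.frequency = 1

    locmap = {
        "wash": StubDistribution({"Washington, US": (38.9, -77.0),
                                  "Washington, UK": (54.9, -1.5)}),
        "paris": StubDistribution({"Paris, FR": (48.85, 2.35)}),
    }
    ambiguous = [l for l in locmap if len(locmap[l].realizations) != 1]
    print(ambiguous)  # ['wash'] -- only this entry would be re-queried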
def geocode_fromList(self, locTexts, results=None, min_popln=None, **kwargs):
    if results is None:
        results = {}
    if min_popln is None:
        min_popln = self.min_popln

    itype = {}
    for l in locTexts:
        if l == "":
            continue
        # entries may be (text, entity-type) tuples or bare strings
        if isinstance(l, tuple):
            itype[l[0]] = l[1]
            l = l[0]
        else:
            itype[l] = 'LOCATION'
        try:
            if l in results:
                results[l].frequency += 1
            else:
                for sub in l.split(","):
                    sub = sub.strip()
                    if sub in results:
                        results[sub].frequency += 1
                    else:
                        itype[sub] = itype[l]
                        try:
                            query = self.gazetteer.query(sub,
                                                         min_popln=min_popln,
                                                         **kwargs)
                            results[sub] = LocationDistribution(query)
                        except UnicodeDecodeError:
                            log.exception("Unable to query gazetteer for - {}".format(sub))
                            continue
                        results[sub].frequency = 1
        except UnicodeDecodeError:
            log.exception("Unable to make query for string - {}".format(encode(l)))

    scores = self.score(results)
    custom_max = lambda x: max(x.values(), key=lambda y: y['score'])
    lrank = self.get_locRanks(scores, results)
    lmap = {l: custom_max(lrank[l]) for l in lrank if lrank[l] != {}}
    if not scores:
        return lmap, {}
    # weight each location's score by its entity type before picking the
    # single representative geo point for the document
    total_weight = sum(self.weightage[itype.get(key, 'OTHER')] for key in lmap)
    best = max(lmap.items(),
               key=lambda x: x[1]['score'] *
               self.weightage[itype.get(x[0], 'OTHER')] / total_weight)
    return lmap, best[1]['geo_point']
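# Hedged worked example (standalone; the `weightage` values here are
# invented): how the weighted argmax at the end of geocode_fromList()
# picks one geo point. Each candidate's score is scaled by the weight of
# its entity type, normalised by the total weight of all candidates.
def _demo_weighted_pick():
    weightage = {'LOCATION': 1.0, 'URL-SUBJECT': 0.5, 'OTHER': 0.1}  # assumed
    itype = {'taiwan': 'URL-SUBJECT', 'taipei': 'LOCATION'}
    lmap = {'taiwan': {'score': 0.9, 'geo_point': (23.7, 121.0)},
            'taipei': {'score': 0.6, 'geo_point': (25.04, 121.56)}}
    total = sum(weightage[itype.get(k, 'OTHER')] for k in lmap)
    best = max(lmap.items(),
               key=lambda kv: kv[1]['score'] *
               weightage[itype.get(kv[0], 'OTHER')] / total)
    print(best[0], best[1]['geo_point'])  # 'taipei' wins: 0.6*1.0 > 0.9*0.5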
def geocode(self, doc):
    """
    Geocode a document using its extracted LOCATION entities plus the
    country/subject hints embedded in its URL.
    """
    def getEntityDetails(entity):
        """
        Return the entity string, its starting offset and its coverage
        window (start/end extended by self.coverageLength).
        """
        start, end = entity['offset'].split(":")
        start, end = int(start), int(end)
        return (entity['expr'], start,
                start - self.coverageLength,
                end + self.coverageLength)

    urlinfo = urlparse(doc["url"])
    loc_results = {}
    locTexts = [getEntityDetails(l)
                for l in doc["BasisEnrichment"]['entities']
                if l['neType'] == 'LOCATION']
    if urlinfo.netloc != "":
        # guard against URLs with an empty path
        path_parts = urlinfo.path.split("/", 2)
        urlsubject = path_parts[1] if len(path_parts) > 1 else ""
        urlcountry = urlinfo.netloc.rsplit(".", 1)[-1]
        if len(urlcountry.strip()) == 2:
            urlcountry = self.gazetteer.get_country(urlcountry.upper())
            if urlcountry != []:
                urlcountry = urlcountry[0]
                urlcountry.confidence = 1.0
                # store in the same shape that query_gazetteer() returns,
                # since the scoring loop below reads the 'geo-point' key
                loc_results["url"] = {'geo-point': LocationDistribution(urlcountry),
                                      'frequency': 1}
        if len(urlsubject) < 20:
            locTexts.insert(0, (urlsubject, -1, -1, -1))

    loc_results.update(self.query_gazetteer(self.group(locTexts)))
    scores = self.score(loc_results)
    custom_max = lambda x: max(x.realizations.values(),
                               key=lambda y: scores[str(y)])
    lmap = {l: custom_max(loc_results[l]['geo-point'])
            for l in loc_results
            if not loc_results[l]['geo-point'].isEmpty()}
    egeo = {}
    if scores:
        egeo = scores[max(scores, key=lambda x: scores[x])]
    return lmap, egeo
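# Hedged sketch (field values invented): the document shape geocode()
# expects -- a 'url' plus BasisEnrichment entities whose 'offset' is a
# "start:end" position string and whose 'expr' is the surface text.
def _demo_geocode_input_shape():
    doc = {
        "url": "http://cnn.com/taiwan/protest.html",
        "BasisEnrichment": {
            "entities": [
                {"neType": "LOCATION", "expr": "Taipei", "offset": "12:13"},
                {"neType": "PERSON", "expr": "John Doe", "offset": "2:4"},
            ]
        },
    }
    entities = [e for e in doc["BasisEnrichment"]["entities"]
                if e["neType"] == "LOCATION"]
    print([e["expr"] for e in entities])  # ['Taipei']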
def _queryitem(self, item, itemtype, **kwargs):
    if itemtype == "LOCATION":
        res = self.gazetteer.query(item, **kwargs)
    else:
        # non-LOCATION hints are first tried as countries (feature
        # code 'pcli'), falling back to first-order administrative
        # divisions ('adm1') when nothing matches
        res = self.gazetteer.query(item, fuzzy='AUTO',
                                   featureCode='pcli', operator='or')
        if res == []:
            res = self.gazetteer.query(item, featureCode='adm1',
                                       operator='or')
    return LocationDistribution(res)
def query_gazetteer(self, lgroups):
    """
    Resolve groups of location strings against the gazetteer,
    intersecting the candidate points of names within each group.
    """
    gp_map = {}
    # reuse the already-resolved points for names seen in earlier groups
    query_gp = lambda x: (self.gazetteer.query(x)
                          if x not in gp_map else gp_map[x]['geo-point'])
    for grp in lgroups:
        imap = {txt: query_gp(txt) for txt in grp}
        imap = self.get_geoPoints_intersection(imap)
        for l in imap:
            if l in gp_map:
                gp_map[l]['frequency'] += 1
            else:
                gp_map[l] = {'geo-point': imap[l], 'frequency': 1}

    for l in gp_map:
        gp_map[l]['geo-point'] = LocationDistribution(gp_map[l]['geo-point'])
    return gp_map
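# Hedged illustration (plain set intersection stands in for the project's
# get_geoPoints_intersection()): names that co-occur in a group keep only
# the candidate points they share, which is how co-mentions disambiguate
# each other.
def _demo_group_intersection():
    candidates = {
        "springfield": {"US.IL", "US.MA", "US.MO"},
        "illinois": {"US.IL"},
    }
    common = set.intersection(*candidates.values())
    resolved = {name: (common if common else pts)
                for name, pts in candidates.items()}
    print(resolved["springfield"])  # {'US.IL'} -- disambiguated by 'illinois'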
def geocode_fromList(self, locTexts, results=None, min_popln=None):
    if results is None:
        results = {}
    if min_popln is None:
        min_popln = self.min_popln

    for l in locTexts:
        try:
            if l in results:
                results[l].frequency += 1
            else:
                q = self.gazetteer.query(l, min_popln=min_popln)
                if not q:
                    # no match for the full string; try comma-separated parts
                    for sub in l.split(","):
                        sub = sub.strip()
                        if sub in results:
                            results[sub].frequency += 1
                        else:
                            results[sub] = LocationDistribution(
                                self.gazetteer.query(sub, min_popln=min_popln))
                            results[sub].frequency = 1
                else:
                    results[l] = LocationDistribution(q)
                    results[l].frequency = 1
        except Exception:
            log.exception("Unable to make query for string - {}".format(encode(l)))

    scores = self.score(results)
    custom_max = lambda x: max(x.values(), key=lambda y: y['score'])
    lrank = self.get_locRanks(scores, results)
    lmap = {l: custom_max(lrank[l]) for l in lrank if lrank[l] != {}}
    if not scores:
        return lmap, {}
    return lmap, max(lmap.values(), key=lambda x: x['score'])['geo_point']
print("gpool created") tp = ThreadPool(10) print("tpool created") #mp = (10) #print "mpool created" def t(l): return gn.query(l, min_popln=0) ##strt = time.clock() ##s = mp.map(t, loclist) ##end = time.clock() ##print "multiprocessing pool: {}".format(end-strt) strt = time.clock() s = tp.map(t, loclist) end = time.clock() print("Threadpool processing: {}".format(end - strt)) strt = time.clock() s = gp.map(t, loclist) end = time.clock() print("gevent pool processing: {}".format(end - strt)) strt = time.clock() s = [LocationDistribution(l) for l in s] end = time.clock() print("LD time:{}".format(end - strt))