Beispiel #1
0
class SupervisedGeo(object):
    def __init__(self,
                 db,
                 min_popln=0,
                 min_length=1,
                 model="./geoModels/rf_geo.pkl"):
        """
        Supervised geocoder: resolves location mentions via a GeoNames
        gazetteer and ranks candidate realizations with a pre-trained
        classifier loaded from *model*.

        Params:
            db         - database handle/path passed straight to GeoNames
            min_popln  - minimum population filter for gazetteer queries
            min_length - minimum entity string length to consider
            model      - path to a pickled model; geocode() uses it as
                         (model[0].transform, model[1].predict[_proba]),
                         i.e. a (transformer, classifier) pair
        """
        self.gazetteer = GeoNames(db)
        self.min_popln = min_popln
        self.min_length = min_length
        # relative confidence per named-entity type; types with weight > 0
        # are treated as location candidates (see _build_data)
        self.weightage = {
            "LOCATION": 1.0,
            "NATIONALITY": 0.75,
            "ORGANIZATION": 0.5,
            "OTHER": 0.0
        }
        # SECURITY: pickle.load executes arbitrary code on load - only use
        # model files from trusted sources
        with open(model, "rb") as inf:
            self.model = pickle.load(inf)

    def _build_data(self,
                    doc=None,
                    loclist=None,
                    eKey='BasisEnrichment',
                    **kwargs):
        locTexts, persons = [], []
        NAMED_ENTITY_TYPES_TO_CHECK = [
            key for key in self.weightage if self.weightage[key] > 0
        ]
        if doc is not None:
            doclength = len(doc[eKey]['tokens'])

            locTexts += [
                (numstrip.sub("", l['expr'].lower()).strip(), l['neType'],
                 (sum([int(_)
                       for _ in l['offset'].split(":")])) / (2.0 * doclength))
                for l in doc[eKey]["entities"]
                if ((l["neType"] in NAMED_ENTITY_TYPES_TO_CHECK)
                    and len(l['expr']) >= self.min_length)
            ]

            persons = [
                (numstrip.sub("", l['expr'].lower()).strip(),
                 (sum([int(_)
                       for _ in l['offset'].split(":")])) / (2.0 * doclength))
                for l in doc[eKey]["entities"]
                if ((l["neType"] == "PERSON")
                    and len(l['expr']) >= self.min_length)
            ]

        if loclist is not None:
            locTexts += [l.lower() for l in loclist]

        return self._esquery_fromList(locTexts,
                                      persons,
                                      doclength=doclength,
                                      **kwargs)

    def _esquery_fromList(self,
                          locTexts,
                          persons,
                          results=None,
                          min_popln=None,
                          **kwargs):
        """
        Resolve each (text, neType, offset) mention against the gazetteer,
        tracking mention frequencies, token offsets, and the set of
        unambiguously realized countries (used to disambiguate the rest).

        Params:
            locTexts  - list of (query_text, entity_type, normalized_offset)
            persons   - list of (person_text, normalized_offset)
            results   - optional pre-seeded dict: text -> LocationDistribution
            min_popln - minimum population filter (defaults to self.min_popln)
            kwargs    - must contain 'doclength' (see _build_data); used to
                        spread offsets of comma-separated sub-strings

        Returns:
            (results, freqsheet, locTexts, meta_entInfo, offset_diffmat,
             persons_res, selco)
        """
        if results is None:
            results = {}

        if min_popln is None:
            min_popln = self.min_popln

        meta_entInfo = {}        # text -> {"offsets", "neType", "indexes"}
        realized_countries = []  # country codes of unambiguous matches
        idx = 0                  # running mention index
        offsetmat = []           # every recorded offset, in encounter order
        for entitem in locTexts:
            querytext, enttype, offset = entitem
            if isempty(querytext):
                continue

            if querytext in results:
                # already resolved: just bump the counters
                results[querytext].frequency += 1
                meta_entInfo[querytext]["offsets"].append(offset)
                meta_entInfo[querytext]["neType"] = (enttype)
                meta_entInfo[querytext]["indexes"].append(idx)
                offsetmat.append(offset)
            else:
                # new text: treat comma-separated parts as separate mentions
                for subidx, substr in enumerate(querytext.split(",")):
                    substr = substr.strip()
                    if substr in results:
                        results[substr].frequency += 1
                        meta_entInfo[substr]["offsets"].append(
                            offset + float(subidx) / kwargs['doclength'])
                        meta_entInfo[substr]["neType"] = (enttype)
                        meta_entInfo[substr]["indexes"].append(idx + subidx)
                        offsetmat.append(offset +
                                         float(subidx) / kwargs['doclength'])
                        continue

                    if substr not in meta_entInfo:
                        meta_entInfo[substr] = {
                            "offsets":
                            [offset + float(subidx) / kwargs['doclength']],
                            "neType":
                            enttype,
                            "indexes": [idx + subidx]
                        }
                        offsetmat.append(offset +
                                         float(subidx) / kwargs['doclength'])
                    else:
                        meta_entInfo[substr]["offsets"].append(
                            offset + float(subidx) / kwargs['doclength'])
                        meta_entInfo[substr]["neType"] = (enttype)
                        meta_entInfo[substr]["indexes"].append(idx + subidx)
                        offsetmat.append(offset +
                                         float(subidx) / kwargs['doclength'])

                    ld = self._queryitem(substr,
                                         meta_entInfo[substr]["neType"])
                    # non-LOCATION types with no gazetteer hit are dropped
                    if meta_entInfo[substr][
                            "neType"] != "LOCATION" and ld.isempty():
                        continue

                    results[substr] = ld
                    # a single realization pins down its country code
                    if len(results[substr].realizations) == 1:
                        realized_countries.append(
                            list(results[substr].realizations.values())[0]
                            ['countryCode'].lower())

                    results[substr].frequency = 1
                # NOTE(review): relies on subidx surviving the loop above;
                # combined with the outer idx += 1, a text with k parts
                # advances idx by k
                idx += subidx

            idx += 1

        # pairwise offset differences between every pair of recorded mentions
        offsetmat = np.array(offsetmat)
        offset_diffmat = offsetmat[:, np.newaxis] - offsetmat
        selco = realized_countries
        #realized_countries = Counter(realized_countries)
        #co_realized = float(sum(realized_countries.values()))
        #selco = [kl for kl, vl in realized_countries.viewitems()
        #         if float(vl/co_realized) >= 0.5]
        #try:
        #    selco = realized_countries.most_common(n=1)[0][0]
        #except:
        #    selco = []

        persons_res = {}
        for entitem in persons:
            querytext, offset = entitem
            if querytext not in persons_res:
                # offsets of every location mention relative to this person
                diffs = offsetmat - offset
                persons_res[querytext] = {
                    "expansions":
                    self._queryitem(querytext, "LOCATION", countryCode=selco),
                    "offset":
                    diffs,
                    "freq":
                    1
                }

            else:
                persons_res[querytext]["freq"] += 1

        # re-query ambiguous locations restricted to the realized countries
        if not isempty(selco):
            results = self.fuzzyquery(results, countryFilter=selco)

        freqsheet = self.score(results, meta_entInfo)

        return results, freqsheet, locTexts, meta_entInfo, offset_diffmat, persons_res, selco

    def _queryitem(self, item, itemtype, **kwargs):
        """
        Look *item* up in the gazetteer and wrap the hits in a
        LocationDistribution. LOCATION entities are queried directly;
        any other entity type (nationality, organization, ...) is first
        tried as a country ('pcli'), then as a first-level admin region.
        """
        if itemtype == "LOCATION":
            return LocationDistribution(self.gazetteer.query(item, **kwargs))

        matches = self.gazetteer.query(item,
                                       fuzzy='AUTO',
                                       featureCode='pcli',
                                       operator='or')
        if matches == []:
            # no country match - fall back to admin-1 regions
            matches = self.gazetteer.query(item,
                                           featureCode='adm1',
                                           operator='or')
        return LocationDistribution(matches)

    def fuzzyquery(self, locmap, countryFilter=[]):
        for loc in locmap:
            if len(locmap[loc].realizations) != 1:
                freq = locmap[loc].frequency
                subres = self.gazetteer.query(loc,
                                              countryCode=countryFilter,
                                              fuzzy='AUTO')
                if subres != []:
                    locmap[loc] = LocationDistribution(
                        subres + locmap[loc].realizations.values())
                    locmap[loc].frequency = freq
        return locmap

    def score(self, results, metaInfo):
        scoresheet = defaultdict(lambda: defaultdict(lambda: {
            "freq": 0.0,
            "offs_idx": []
        }))
        num_mentions = float(sum((l.frequency for l in results.values())))

        def update(key, l):
            offs = metaInfo[key]["indexes"]
            for s in l.city:
                scoresheet["city"][s]['freq'] += l.frequency
                scoresheet["city"][s]['offs_idx'] += (offs)

            for s in l.admin1:
                scoresheet["admin1"][s]["freq"] += l.frequency
                scoresheet["admin1"][s]['offs_idx'] += (offs)

            for s in l.country:
                scoresheet["country"][s]["freq"] += l.frequency
                scoresheet["country"][s]['offs_idx'] += (offs)

        _ = [update(key, val) for key, val in results.viewitems()]

        for typ in scoresheet:
            for s in scoresheet[typ]:
                scoresheet[typ][s]['freq'] /= num_mentions

            scoresheet[typ].default_factory = None

        scoresheet.default_factory = None
        return scoresheet

    def geocode(self, doc, enrichmentKeys=['BasisEnrichment'], **kwargs):
        """
        Attach embersGeoCode to document
        """
        # NOTE(review): enrichmentKeys is a mutable default; it is only read
        # here, so this is safe, but worth confirming no caller mutates it
        eKey = None
        for key in enrichmentKeys:
            if key in doc and doc[key]:
                eKey = key  # last matching key wins

        if eKey is None:
            return doc

        all_exp_locs, freqsheet, loctexts, metaInfo, offsdiffmat, persons_res, selco = self._build_data(
            doc)
        if "events" in doc:
            self._expand_events(doc)

        locdist = {}  # location text -> best realization + classifier confidence
        clfdata = {}  # debug payload: per-location (name, feature-dict) pairs
        # NOTE(review): zip() is lazy on Python 3, so clfdata would hold
        # one-shot iterators there - this code appears to target Python 2
        for loc in all_exp_locs:
            x, names = self.build_featuremat(all_exp_locs[loc], offsdiffmat,
                                             freqsheet)
            if x != []:
                clfdata[loc] = zip(names, x)
                # model = (feature transformer, classifier);
                # column 1 of predict_proba = P(candidate is correct)
                ypred = self.model[1].predict_proba(
                    self.model[0].transform(x))[:, 1]
                prob, final_nm = max(zip(ypred, names), key=lambda lx: lx[0])
                locdist[loc] = {
                    "conf": prob,
                    "details":
                    all_exp_locs[loc].realizations[final_nm].__dict__
                }

        person_dist = {}  # person names that resolve to a location
        for loc in persons_res:
            exps = persons_res[loc]["expansions"]
            x, names = [], []
            for real in exps.realizations:
                d1 = self.build_persmat(exps.realizations[real],
                                        persons_res[loc], freqsheet)
                x.append(d1)
                names.append(real)

            if x != []:
                clfdata[loc] = zip(names, x)
                ypred = self.model[1].predict(self.model[0].transform(x))
                pred, nm = max(zip(ypred, names), key=lambda lx: lx[0])
                # NOTE(review): 'is True' is False for numpy bools
                # (np.True_ is not the builtin True) - confirm the dtype
                # returned by predict()
                if pred is True:
                    person_dist[loc] = exps.realizations[nm].__dict__

        true_geos = self.matchwithGSRLocs(doc, all_exp_locs, persons_res,
                                          offsdiffmat, freqsheet)
        doc['true_geos'] = true_geos
        doc['location_distribution'] = locdist
        doc['person_dist'] = person_dist
        doc['geo_debug'] = {"selco": selco, "clfdata": clfdata}
        return doc

    def calc_offset_stats(self, indices, diffmat):
        tril = np.tril(diffmat[indices])
        ntril = tril[np.nonzero(tril)]
        abstril = np.abs(ntril)
        if abstril.shape[0] == 0:
            return 1, 1, 1, 1

        abs_minval = np.min(abstril)
        medval = np.mean(abstril)

        try:
            before_closest = np.min(ntril[ntril > 0])
        except:
            before_closest = 1

        try:
            after_closest = abs(np.max(ntril[ntril < 0]))
        except:
            after_closest = 1

        return medval, abs_minval, before_closest, after_closest

    def _single_build_featuremat(self, realization, diffmat, freqsheet):
        """
        Build the classifier feature dict for one candidate realization:
        mention frequencies at country/state/city level plus offset
        statistics (see calc_offset_stats) for each level.
        """
        country = realization.country
        admin = "/".join([country, realization.admin1])
        city = "/".join(
            [admin,
             getattr(realization, "admin2", "") or realization.city])
        featureCode = realization.featureCode  # NOTE(review): unused local
        offs = freqsheet["country"][country + "//"]["offs_idx"]
        co_offset = self.calc_offset_stats(np.ix_(offs, offs), diffmat)

        try:
            offs = freqsheet["admin1"][admin + "/"]["offs_idx"]
            st_offset = self.calc_offset_stats(np.ix_(offs, offs), diffmat)
        except:
            # admin1 missing from the frequency sheet
            st_offset = [0, 0, 0, 0]

        # NOTE(review): featureCode[:3] yields 3 chars but "adm1"/"pcli" are
        # 4 chars, so this test is always True and the else branch is dead -
        # probably meant [:4] (and possibly case-insensitive); confirm before
        # changing, as it alters the feature values fed to the model
        if realization.featureCode[:3] not in ("adm1", "pcli"):
            try:
                offs = freqsheet["city"][city]["offs_idx"]
                ci_offset = self.calc_offset_stats(np.ix_(offs, offs), diffmat)
                cifreq = freqsheet["city"][city]["freq"]
            except:
                # city not in the frequency sheet
                ci_offset = [1, 1, 1, 1]
                cifreq = 0
        else:
            ci_offset = [0, 0, 0, 0]
            cifreq = 0

        return {
            "country":
            freqsheet["country"][country + "//"]["freq"],
            "state":
            freqsheet.get("admin1", {}).get(admin + "/", {}).get("freq", 0),
            "city":
            cifreq,
            "poplnConf":
            realization.poplnConf,
            "co_Offmean":
            co_offset[0],
            "co_Offmin":
            co_offset[1],
            "co_prev":
            co_offset[2],
            "co_after":
            co_offset[3],
            "st_offmean":
            st_offset[0],
            "st_offmin":
            st_offset[1],
            "st_prev":
            st_offset[2],
            "st_after":
            st_offset[3],
            "ci_offmean":
            ci_offset[0],
            "ci_offmin":
            ci_offset[1],
            "ci_prev":
            ci_offset[2],
            "ci_after":
            ci_offset[3]
        }

    def build_persmat(self, realization, meta_info, freqsheet):
        """
        Build the classifier feature dict for one realization of a PERSON
        mention. Mirrors _single_build_featuremat but takes the person's
        precomputed offset-difference matrix (meta_info['offset']) instead
        of the global one.
        """
        country = realization.country
        admin = "/".join([country, realization.admin1])
        city = "/".join(
            [admin,
             getattr(realization, "admin2", "") or realization.city])
        featureCode = realization.featureCode  # NOTE(review): unused local
        co_offset = self.calc_offset_stats(
            freqsheet["country"][country + "//"]["offs_idx"],
            meta_info['offset'])

        if (admin + "/") in freqsheet["admin1"]:
            st_offset = self.calc_offset_stats(
                freqsheet["admin1"][admin + "/"]["offs_idx"],
                meta_info["offset"])
            st_freq = freqsheet["admin1"][admin + "/"]["freq"]
        else:
            # state unseen in the document: fall back to the person's freq
            st_offset = [1, 1, 1, 1]
            st_freq = meta_info['freq']

        # NOTE(review): same issue as _single_build_featuremat -
        # featureCode[:3] can never equal the 4-char codes "adm1"/"pcli",
        # so this branch is always taken; confirm intent before fixing
        if realization.featureCode[:3] not in ("adm1", "pcli"):
            if city in freqsheet.get("city", {}):
                ci_offset = self.calc_offset_stats(
                    freqsheet["city"][city]["offs_idx"], meta_info["offset"])
                cifreq = freqsheet["city"][city]["freq"]
            else:
                ci_offset = [1, 1, 1, 1]
                cifreq = meta_info["freq"]
        else:
            ci_offset = [0, 0, 0, 0]
            cifreq = 0

        return {
            "country": freqsheet["country"][country + "//"]["freq"],
            "state": st_freq,
            "city": cifreq,
            "poplnConf": realization.poplnConf,
            "co_Offmean": co_offset[0],
            "co_Offmin": co_offset[1],
            "co_prev": co_offset[2],
            "co_after": co_offset[3],
            "st_offmean": st_offset[0],
            "st_offmin": st_offset[1],
            "st_prev": st_offset[2],
            "st_after": st_offset[3],
            "ci_offmean": ci_offset[0],
            "ci_offmin": ci_offset[1],
            "ci_prev": ci_offset[2],
            "ci_after": ci_offset[3]
        }

    def build_featuremat(self, loc, *args):
        xmat = []
        lbls = []
        for real in loc.realizations:
            x = self._single_build_featuremat(loc.realizations[real], *args)
            lbls.append(real)
            xmat.append(x)

        #if xmat != []:
        #    xmat = self.model[0].transform(xmat)

        return xmat, lbls

    def _expand_events(self, doc):
        for evt in doc["events"]:
            if "expanded_loc" in evt:
                continue

            try:
                loc = self.gazetteer.get_locInfo(country=evt['Country'],
                                                 admin=evt['State'],
                                                 city=evt["City"])
                evt['expanded_loc'] = loc
            except Exception as e:
                pass
        return

    def matchwithGSRLocs(self, doc, all_exp_locs, persons_res, offsdiffmat,
                         freqsheet):
        """
        Match candidate realizations against the ground-truth (GSR) event
        locations in doc['events'].

        Returns:
            {'locations': {...}, 'persons': {...}} mapping mention text to
            the attribute dict of the realization that matched exactly.
        """
        # canonical "Country/State/City" strings for every event location
        locstrings = set()
        for evt in doc['events']:
            estr = u"/".join([evt['Country'], evt['State'], evt['City']])
            locstrings.add(estr)
            if "expanded_loc" in evt:
                for loc in evt['expanded_loc']:
                    gp = GeoPoint(**loc)
                    lstr = "/".join([
                        gp.country, gp.admin1,
                        (getattr(gp, "admin2", "") or gp.city)
                    ])
                    locstrings.add(lstr)

        matched_locs = set()
        true_geos = {'persons': {}, 'locations': {}}
        for loc in all_exp_locs:
            for x in all_exp_locs[loc].realizations:
                if x in locstrings:
                    true_geos['locations'][loc] = all_exp_locs[
                        loc].realizations[x].__dict__

        # NOTE(review): matched_locs is never populated, so remaininglocs is
        # always the full locstrings set - likely a matched_locs.add(x) was
        # intended in the loop above; confirm before changing, since fixing
        # it would stop persons from re-matching already-matched locations
        remaininglocs = locstrings - matched_locs
        for loc in persons_res:
            for x in persons_res[loc]["expansions"].realizations:
                if x in remaininglocs:
                    true_geos['persons'][loc] = persons_res[loc][
                        "expansions"].realizations[x].__dict__

        return true_geos
Beispiel #2
0
class BaseGeo(object):
    def __init__(self, db, min_popln=0, min_length=1):
        """
        Gazetteer-backed geocoder.

        Params:
            db         - database handle/path handed to GeoNames
            min_popln  - minimum population filter for gazetteer queries
            min_length - minimum entity string length to consider
        """
        self.min_popln = min_popln
        self.min_length = min_length
        self.gazetteer = GeoNames(db)
        # relative confidence assigned to each named-entity type when scoring
        self.weightage = dict(LOCATION=1.0,
                              NATIONALITY=0.75,
                              ORGANIZATION=0.5,
                              OTHER=0.2)

    def geocode(self, doc=None, loclist=None, **kwargs):
        locTexts = []
        if doc is not None:
            # Get all location entities from document with atleast min_length characters
            locTexts += [(numstrip.sub("", l['expr'].lower()).strip(), l['neType']) for l in
                         doc["BasisEnrichment"]["entities"]
                         if ((l["neType"] in ("LOCATION", "NATIONALITY")) and
                             len(l['expr']) >= self.min_length)]

            # locTexts += [(numstrip.sub("", l['expr'].lower()).strip(), 'OTHER') for l in
            #              doc['BasisEnrichment']['nounPhrases']]

        if loclist is not None:
            locTexts += [l.lower() for l in loclist]

        results = self.get_locations_fromURL((doc["url"] if doc.get("url", "")
                                              else doc.get("link", "")))
        # results = {}
        # kwargs['analyzer'] = 'standard'
        return self.geocode_fromList(locTexts, results, **kwargs)

    def geocode_fromList(self, locTexts, results=None, min_popln=None, **kwargs):
        if results is None:
            results = {}

        if min_popln is None:
            min_popln = self.min_popln

        itype = {}
        for l in locTexts:
            if l == "":
                continue
            if isinstance(l, tuple):
                itype[l[0]] = l[1]
                l = l[0]
            else:
                itype[l] = 'LOCATION'
            try:
                if l in results:
                    results[l].frequency += 1
                else:
                    for sub in l.split(","):
                        sub = sub.strip()
                        if sub in results:
                            results[sub].frequency += 1
                        else:
                            itype[sub] = itype[l]
                            try:
                                # # Exclusion
                                # list_exclude = ['city','town']
                                # for ext in list_exclude:
                                #     if ext in sub:
                                #         sub = sub.replace(ext, "")
                                query = self.gazetteer.query(sub, min_popln=min_popln,**kwargs)
                                results[sub] = LocationDistribution(query)
                            except UnicodeDecodeError:
                                ipdb.set_trace()
                            results[sub].frequency = 1
            except UnicodeDecodeError:
                log.exception("Unable to make query for string - {}".format(encode(l)))

        scores = self.score(results)
        custom_max = lambda x: max(x.viewvalues(),
                                   key=lambda y: y['score'])
        lrank = self.get_locRanks(scores, results)
        lmap = {l: custom_max(lrank[l]) for l in lrank if not lrank[l] == {}}
        total_weight = sum([self.weightage[itype.get(key, 'OTHER')] for key in lmap])
        return lmap, max(lmap.items(),
                         key=lambda x: x[1]['score'] * self.weightage[itype.get(x[0], 'OTHER')] / total_weight)[1]['geo_point'] if scores else {}

    def get_locations_fromURL(self, url):
        """
        Parse URL to get URL COUNTRY and also URL SUBJECT like taiwan in
        'cnn.com/taiwan/protest.html'

        Params:
            url - a web url

        Returns:
            Dict of locations obtained from URL
        """
        results = {}
        urlinfo = urlparse(url)
        if urlinfo.netloc != "":
            urlsubject = urlinfo.path.split("/", 2)[1]
            urlcountry = urlinfo.netloc.rsplit(".", 1)[-1]
            # Find URL DOMAIN Country from 2 letter iso-code
            if len(urlcountry.strip()) == 2:
                urlcountry = self.gazetteer.get_country(urlcountry.upper())
                if urlcountry != []:
                    urlcountry = urlcountry[0]
                    urlcountry.confidence = 1.0
                    results["URL-DOMAIN_{}".format(urlcountry)] = LocationDistribution(urlcountry)
                    results["URL-DOMAIN_{}".format(urlcountry)].frequency = 1

            if 5 < len(urlsubject) < 20:
                usubj_q = self.gazetteer.query(urlsubject, 15000)
                if usubj_q:
                    results["URL-SUBJECT_{}".format(urlsubject)] = LocationDistribution(usubj_q)
                    results["URL-SUBJECT_{}".format(urlsubject)].frequency = 1
        return results

    def annotate(self, doc, **kwargs):
        """
        Attach embersGeoCode to document
        """
        try:
            lmap, gp = self.geocode(doc=doc, **kwargs)
        except UnicodeDecodeError as e:
            log.exception("unable to geocode:{}".format(str(e)))
            lmap, gp = {}, {}

        doc['embersGeoCode'] = gp
        doc["location_distribution"] = lmap
        return doc

    def update(self,l,scoresheet):
        for s in l.city:
            scoresheet[s] += l.city[s] * l.frequency
        for s in l.admin1:
            scoresheet[s] += l.admin1[s] * l.frequency
        for s in l.country:
            scoresheet[s] += l.country[s] * l.frequency

    def score(self, results):
        scoresheet = defaultdict(float)
        num_mentions = float(sum((l.frequency for l in results.values())))

        _ = [self.update(item,scoresheet) for item in results.viewvalues()]
        for s in scoresheet:
            scoresheet[s] /= num_mentions

        return scoresheet

    def get_realization_score(self,l,scores):
        lscore_map = {}
        for lstr, r in l.realizations.viewitems():
            base_score = scores[lstr]
            # if r.ltype == 'city':
            if not isempty(r.city):
                l_adminstr = '/'.join([r.country, r.admin1, ''])
                base_score = (base_score + scores[l_adminstr] + scores[r.country + "//"]) * r.confidence

            elif not isempty(r.admin1):
                base_score = (base_score + scores[r.country + "//"]) * r.confidence

            elif r.ltype == "country":
                # do nothing
                pass
            else:
                base_score = base_score * r.confidence
                # code for other types
                # if not isempty(r.city):
                #    l_adminstr = '/'.join([r.country, r.admin1, ''])
                #    base_score = (base_score + scores[l_adminstr] + scores[r.country + "//"]) * r.confidence

                # ipdb.set_trace()
                # raise Exception("Unknown location type-{} for {}".format(r.ltype, lstr))

            lscore_map[lstr] = {'score': base_score, 'geo_point': r.__dict__}

        # for s in l.realizations:
        #    base_score = scores[s]
        #    if l.realizations[s].ltype not in ('country', 'admin'):
        #        l_adminstr = encode('/'.join([l.realizations[s].country,
        #                               l.realizations[s].admin1, '']))

        #        base_score += scores[l_adminstr] + scores[l.realizations[s].country]

        #    elif l.realizations[s].ltype == 'admin':
        #        base_score += scores[l.realizations[s].country]

        #    lscore_map[s] = {'score': base_score, 'geo_point': l.realizations[s].__dict__}
        return lscore_map

    def get_locRanks(self, scores, loc_cand):
        """
        Each city score needs to be re-inforced with the
        corresponding state and country scores to get the actual meaning
        of that name. For example, several mentions of cities within virginia
        would have given virginia
        state a high score. Now this high score has to be brought back to lower levels to
        decide on meaning of each name/city
        """
        loc_rankmap = {}

        for locpt in loc_cand:
            loc_rankmap[locpt] = self.get_realization_score(loc_cand[locpt],scores)
        return loc_rankmap
Beispiel #3
0
class BaseGeo(object):
    """
    Gazetteer-backed geocoder: collects LOCATION entities from a document's
    BasisEnrichment (and/or an explicit location list), resolves them against
    GeoNames, and attaches the best-scoring geo point to the document.

    NOTE(review): geocode_fromList relies on self.score / self.get_locRanks,
    which are not defined on this class in this file - presumably supplied by
    a fuller implementation elsewhere; confirm before using standalone.
    """

    def __init__(self,
                 dbpath="./Geonames_dump.sql",
                 min_popln=0,
                 min_length=1):
        """
        Params:
            dbpath     - path to the GeoNames SQL dump
            min_popln  - minimum population filter for gazetteer queries
            min_length - minimum entity string length to consider
        """
        self.gazetteer = GeoNames(dbpath)
        self.min_popln = min_popln
        self.min_length = min_length

    def geocode(self, doc=None, loclist=None):
        """
        Geocode a document and/or an explicit list of location strings.

        Returns:
            (location_map, best_geo_point_dict) from geocode_fromList.
        """
        locTexts = []
        if doc is not None:
            # all LOCATION entities with at least min_length characters
            locTexts += [
                l['expr'].lower() for l in doc["BasisEnrichment"]["entities"]
                if ((l["neType"] == "LOCATION")
                    and len(l['expr']) >= self.min_length)
            ]

        if loclist is not None:
            locTexts += [l.lower() for l in loclist]

        # Bug fix: doc may be None when only loclist is supplied - the old
        # code crashed on doc.get(...) here.
        url = "" if doc is None else (doc["url"] if doc.get("url", "") else
                                      doc.get("link", ""))
        results = self.get_locations_fromURL(url)
        return self.geocode_fromList(locTexts, results)

    def geocode_fromList(self, locTexts, results=None, min_popln=None):
        """
        Resolve each string against the gazetteer (falling back to its
        comma-separated parts when the full string has no match), count
        repeat mentions, then rank all candidate realizations.

        Returns:
            (lmap, geo_point): lmap maps each string to its best-scoring
            realization; geo_point is the overall winner's attribute dict,
            or {} when nothing scored.
        """
        if results is None:
            results = {}

        if min_popln is None:
            min_popln = self.min_popln

        for l in locTexts:
            try:
                if l in results:
                    results[l].frequency += 1
                    continue
                q = self.gazetteer.query(l, min_popln=min_popln)
                if q:
                    results[l] = LocationDistribution(q)
                    results[l].frequency = 1
                else:
                    # no match for the full string: try comma-separated parts
                    for sub in l.split(","):
                        sub = sub.strip()
                        if sub in results:
                            results[sub].frequency += 1
                        else:
                            results[sub] = LocationDistribution(
                                self.gazetteer.query(sub,
                                                     min_popln=min_popln))
                            results[sub].frequency = 1
            except Exception:  # was a bare except: (also caught SystemExit etc.)
                log.exception("Unable to make query for string - {}".format(
                    encode(l)))

        scores = self.score(results)
        lrank = self.get_locRanks(scores, results)
        # best realization per string (was dict.viewvalues(), Python-2-only)
        lmap = {
            l: max(lrank[l].values(), key=lambda y: y['score'])
            for l in lrank if lrank[l] != {}
        }
        if not (scores and lmap):
            # old code raised ValueError (max of empty) when scores was
            # truthy but lmap was empty
            return lmap, {}
        return lmap, max(lmap.values(), key=lambda x: x['score'])['geo_point']

    def get_locations_fromURL(self, url):
        """
        Parse URL to get URL COUNTRY and also URL SUBJECT like taiwan in
        'cnn.com/taiwan/protest.html'

        Params:
            url - a web url

        Returns:
            Dict of locations obtained from URL
        """
        results = {}
        urlinfo = urlparse(url)
        if urlinfo.netloc == "":
            return results

        urlsubject = urlinfo.path.split("/", 2)[1]
        urlcountry = urlinfo.netloc.rsplit(".", 1)[-1]
        # a two-letter TLD is treated as an ISO country code
        if len(urlcountry.strip()) == 2:
            urlcountry = self.gazetteer.get_country(urlcountry.upper())
            if urlcountry != []:
                urlcountry = urlcountry[0]
                urlcountry.confidence = 1.0
                results["URL-DOMAIN_{}".format(
                    urlcountry)] = LocationDistribution(urlcountry)
                results["URL-DOMAIN_{}".format(urlcountry)].frequency = 1

        if self.min_length < len(urlsubject) < 20:
            usubj_q = self.gazetteer.query(urlsubject, 15000)
            if usubj_q:
                results["URL-SUBJECT_{}".format(
                    urlsubject)] = LocationDistribution(usubj_q)
                results["URL-SUBJECT_{}".format(urlsubject)].frequency = 1
        return results

    def annotate(self, doc):
        """
        Attach embersGeoCode to document
        """
        try:
            lmap, gp = self.geocode(doc=doc)
        except Exception as e:  # bug fix: 'except Exception, e' is Py2-only syntax
            log.exception("unable to geocode:{}".format(str(e)))
            lmap, gp = {}, {}

        doc['embersGeoCode'] = gp
        doc["location_distribution"] = lmap
        return doc
Beispiel #4
0
class TextGeo(object):
    """
    Geocode a document by grouping location mentions that occur close
    together in the text and resolving each group jointly against the
    GeoNames gazetteer.
    """

    def __init__(self,
                 dbpath="./Geonames_dump.sql",
                 min_popln=0,
                 coverageLength=10):
        """
        Params:
            dbpath         - path to the GeoNames SQL dump
            min_popln      - minimum population cutoff for candidate places
            coverageLength - character window around a mention used to decide
                             whether neighbouring mentions form one group
        """
        self.coverageLength = coverageLength
        # BUG FIX: the dbpath argument was previously ignored in favour of
        # the hard-coded literal "./Geonames_dump.sql".
        self.gazetteer = GeoNames(dbpath)
        self.min_popln = min_popln

    def geocode(self, doc):
        """
        Geocode a single enriched document.

        Params:
            doc - document with 'url' and doc['BasisEnrichment']['entities']

        Returns:
            (lmap, egeo): lmap maps each mention to its best-scoring
            realization; egeo is the top aggregate score entry, or {} when
            nothing scored.
        """
        def getEntityDetails(entity):
            """
            return entity string, starting offset, coverage end point
            """
            start, end = entity['offset'].split(":")
            start, end = int(start), int(end)
            return (entity['expr'], start, start - self.coverageLength,
                    end + self.coverageLength)

        urlinfo = urlparse(doc["url"])
        loc_results = {}
        locTexts = [
            getEntityDetails(l) for l in doc["BasisEnrichment"]['entities']
            if l['neType'] == 'LOCATION'
        ]
        if urlinfo.netloc != "":
            urlsubject = urlinfo.path.split("/", 2)[1]
            urlcountry = urlinfo.netloc.rsplit(".", 1)[-1]
            # A two-letter TLD may be an ISO-3166 country code.
            if len(urlcountry.strip()) == 2:
                urlcountry = self.gazetteer.get_country(urlcountry.upper())
                if urlcountry != []:
                    urlcountry = urlcountry[0]
                    urlcountry.confidence = 1.0
                    loc_results["url"] = LocationDistribution(urlcountry)
                    loc_results["url"].frequency = 1
            if len(urlsubject) < 20:
                # Sentinel offsets (-1) keep the URL subject in its own group.
                locTexts.insert(0, (urlsubject, -1, -1, -1))

        loc_results.update(self.query_gazetteer(self.group(locTexts)))

        scores = self.score(loc_results)
        custom_max = lambda x: max(x.realizations.viewvalues(),
                                   key=lambda x: scores[x.__str__()])
        lmap = {
            l: custom_max(loc_results[l]['geo-point'])
            for l in loc_results if not loc_results[l]['geo-point'].isEmpty()
        }
        egeo = {}
        if scores:
            egeo = scores[max(scores, key=lambda x: scores[x])]
        return lmap, egeo

    def score(self, results):
        """
        Aggregate frequency-weighted confidences per candidate place at every
        admin level (city/admin1/country) into a single scoresheet.
        """
        scoresheet = defaultdict(float)

        def update(item):
            l = item['geo-point']
            freq = item['frequency']
            for s in l.city:
                scoresheet[s] += l.city[s] * freq
            for s in l.admin1:
                scoresheet[s] += l.admin1[s] * freq
            for s in l.country:
                scoresheet[s] += l.country[s] * freq

        [update(item) for item in results.viewvalues()]
        return scoresheet

    def query_gazetteer(self, lgroups):
        """
        Resolve each group of co-occurring mentions, narrowing candidates by
        the intersection of their possible countries/admin1 regions.

        Params:
            lgroups - list of groups (lists of mention strings)

        Returns:
            {mention: {'geo-point': LocationDistribution, 'frequency': n}}
        """
        gp_map = {}
        # Memoize gazetteer lookups across groups.
        query_gp = lambda x: self.gazetteer.query(
            x) if x not in gp_map else gp_map[x]
        for grp in lgroups:
            imap = {txt: query_gp(txt) for txt in grp}
            imap = self.get_geoPoints_intersection(imap)
            for l in imap:
                if l in gp_map:
                    gp_map[l]['frequency'] += 1
                else:
                    gp_map[l] = {'geo-point': imap[l], 'frequency': 1}

        for l in gp_map:
            gp_map[l]['geo-point'] = LocationDistribution(
                gp_map[l]['geo-point'])

        return gp_map

    def group(self, loc):
        """
        Group mentions whose coverage windows overlap.

        Params:
            loc - list of (expr, start, cov_start, cov_end) tuples in
                  document order

        Returns:
            list of groups, each a list of mention strings
        """
        groups = []
        i = 0
        while i < len(loc):
            grp = [loc[i][0]]
            for j, l in enumerate(loc[i + 1:]):
                # Next mention starts inside current coverage window: same group.
                if l[1] <= loc[i][-1]:
                    grp.append(l[0])
                    i += 1
                else:
                    groups.append(grp)
                    i += 1
                    grp = [loc[i][0]]
                    break
            else:
                # Ran off the end of the list: flush the open group.
                groups.append(grp)
                i += 1
        return groups

    def get_geoPoints_intersection(self, gps):
        """
        Narrow each mention's candidate set using the countries (and then
        admin1 regions) shared by all mentions in the group.
        """
        try:
            selcountry = set.intersection(
                *[set([l.country]) for name in gps for l in gps[name]])
        except Exception:  # BUG FIX: bare except also trapped SystemExit etc.
            # e.g. TypeError when gps is empty (intersection needs >= 1 arg)
            selcountry = None

        if not selcountry:
            return gps

        selcountry = selcountry.pop()
        filtered_gps = [
            set([encode('/'.join([l.country, l.admin1, ""]))]) for name in gps
            for l in gps[name] if l.country == selcountry
        ]

        sel_admin1 = set.intersection(*filtered_gps)
        if not sel_admin1:
            # No common admin1: keep everything within the common country.
            return {
                name: [l for l in gps[name] if l.country == selcountry]
                for name in gps
            }

        sel_admin1 = sel_admin1.pop()
        ns = {}
        for l in gps:
            t_admin = [gp for gp in gps[l] if gp.__str__() == sel_admin1]
            if t_admin != []:
                ns[l] = t_admin
                continue
            t_cand = [
                gp for gp in gps[l]
                if encode("/".join([gp.country, gp.admin1, ""])) == sel_admin1
            ]
            ns[l] = t_cand
        return ns
Beispiel #5
0
class BaseGeo(object):
    """
    Baseline geocoder: weighted entity mentions are resolved against the
    GeoNames gazetteer and scored hierarchically (city -> admin1 -> country).
    """

    def __init__(self, db, min_popln=0, min_length=1):
        """
        Params:
            db         - GeoNames database handle/path for the gazetteer
            min_popln  - minimum population cutoff for candidates
            min_length - minimum character length of an entity mention
        """
        self.gazetteer = GeoNames(db)
        self.min_popln = min_popln
        self.min_length = min_length
        # Relative importance of each named-entity type when ranking.
        self.weightage = {
            "LOCATION": 1.0,
            "NATIONALITY": 0.75,
            "ORGANIZATION": 0.5,
            "OTHER": 0.0
        }

    def geocode(self,
                doc=None,
                loclist=None,
                eKey='BasisEnrichment',
                **kwargs):
        """
        Collect entity mentions from a document and/or an explicit list of
        location strings, then geocode them.

        Params:
            doc     - enriched document with doc[eKey]['entities']
            loclist - optional plain location strings (lower-cased, appended)
            eKey    - enrichment key holding the entities

        Returns:
            output of geocode_fromList
        """
        locTexts = []
        # BUG FIX: persons must exist even when doc is None; it was previously
        # only bound inside the "doc is not None" branch, causing a NameError.
        persons = []
        NAMED_ENTITY_TYPES_TO_CHECK = [
            key for key in self.weightage if self.weightage[key] > 0
        ]
        if doc is not None:
            # Get all location entities from document with atleast min_length characters
            locTexts += [(numstrip.sub("",
                                       l['expr'].lower()).strip(), l['neType'])
                         for l in doc[eKey]["entities"]
                         if ((l["neType"] in NAMED_ENTITY_TYPES_TO_CHECK)
                             and len(l['expr']) >= self.min_length)]

            persons = [(numstrip.sub("",
                                     l['expr'].lower()).strip(), l['neType'])
                       for l in doc[eKey]["entities"]
                       if ((l["neType"] == "PERSON")
                           and len(l['expr']) >= self.min_length)]

        if loclist is not None:
            locTexts += [l.lower() for l in loclist]

        # BUG FIX: guard doc=None; doc.get(...) previously raised
        # AttributeError when only loclist was supplied.
        results = {}
        if doc is not None:
            results = self.get_locations_fromURL(
                (doc["url"] if doc.get("url", "") else doc.get("link", "")))
        return self.geocode_fromList(locTexts, persons, results, **kwargs)

    def geocode_fromList(self,
                         locTexts,
                         persons,
                         results=None,
                         min_popln=None,
                         **kwargs):
        """
        Geocode a prepared list of mentions.

        Params:
            locTexts  - strings or (string, neType) tuples to geocode
            persons   - (string, neType) tuples of PERSON mentions used as
                        fallback location evidence
            results   - optional pre-seeded {mention: LocationDistribution}
            min_popln - population cutoff (defaults to self.min_popln)

        Returns:
            (lmap, best_geo_point): lmap maps mentions to their top-ranked
            realization; best_geo_point is the type-weighted best candidate
            ({} when nothing scored).
        """
        if results is None:
            results = {}

        if min_popln is None:
            min_popln = self.min_popln

        itype = {}
        realized_countries = []
        for l in locTexts:
            if l == "":
                continue
            if isinstance(l, tuple):
                itype[l[0]] = l[1]
                l = l[0]
            else:
                itype[l] = 'LOCATION'
            try:
                if l in results:
                    results[l].frequency += 1
                else:
                    # Comma-separated mentions are split and queried separately.
                    for sub in l.split(","):
                        sub = sub.strip()
                        if sub in results:
                            results[sub].frequency += 1
                        else:
                            itype[sub] = itype[l]
                            try:
                                results[sub] = self._queryitem(sub, itype[sub])
                                if len(results[sub].realizations) == 1:
                                    realized_countries.append(
                                        list(
                                            results[sub].realizations.values())
                                        [0]['countryCode'].lower())
                            except UnicodeDecodeError:
                                # BUG FIX: was ipdb.set_trace() (a debugger
                                # trap left in), after which the frequency
                                # assignment below raised KeyError.  Re-raise
                                # so the outer handler logs and skips.
                                raise
                            results[sub].frequency = 1
            except UnicodeDecodeError:
                log.exception("Unable to make query for string - {}".format(
                    encode(l)))

        # Countries of unambiguously resolved mentions restrict fuzzy retries.
        selco = list(set(realized_countries))

        if selco not in (None, "", []):
            results = self.fuzzyquery(results, countryFilter=selco)

        persons_res = {}
        for entitem in persons:
            querytext, _ = entitem
            if querytext not in persons_res:
                persons_res[querytext] = {
                    "expansions":
                    self._queryitem(querytext, "LOCATION", countryCode=selco),
                    "freq":
                    1
                }
                if querytext not in results:
                    results[querytext] = persons_res[querytext]['expansions']
                    results[querytext].frequency = 1
            else:
                persons_res[querytext]["freq"] += 1
                results[querytext].frequency += 1

        scores = self.score(results)
        custom_max = lambda x: max(x.viewvalues(), key=lambda y: y['score'])
        lrank = self.get_locRanks(scores, results)
        lmap = {l: custom_max(lrank[l]) for l in lrank if not lrank[l] == {}}
        # Weight each mention's score by its entity type before picking best.
        total_weight = sum(
            [self.weightage[itype.get(key, 'OTHER')] for key in lmap])
        return lmap, max(lmap.items(),
                         key=lambda x: x[1]['score'] * self.weightage[
                             itype.get(x[0], 'OTHER')] / total_weight
                         )[1]['geo_point'] if scores else {}

    def _queryitem(self, item, itemtype, **kwargs):
        """
        Query the gazetteer for one mention.  LOCATION mentions are queried
        directly; other types are tried first as country names (fuzzy,
        featureCode 'pcli'), then as admin1 regions.

        Returns:
            LocationDistribution over the candidate expansions
        """
        if itemtype == "LOCATION":
            res = self.gazetteer.query(item, **kwargs)
        else:
            res = self.gazetteer.query(item,
                                       fuzzy='AUTO',
                                       featureCode='pcli',
                                       operator='or')
            if res == []:
                res = self.gazetteer.query(item,
                                           featureCode='adm1',
                                           operator='or')

        return LocationDistribution(res)

    def get_locations_fromURL(self, url):
        """
        Parse URL to get URL COUNTRY and also URL SUBJECT like taiwan in
        'cnn.com/taiwan/protest.html'

        Params:
            url - a web url

        Returns:
            Dict of locations obtained from URL
        """
        results = {}
        urlinfo = urlparse(url)
        if urlinfo.netloc != "":
            urlsubject = urlinfo.path.split("/", 2)[1]
            urlcountry = urlinfo.netloc.rsplit(".", 1)[-1]
            # Find URL DOMAIN Country from 2 letter iso-code
            if len(urlcountry.strip()) == 2:
                urlcountry = self.gazetteer.get_country(urlcountry.upper())
                if urlcountry != []:
                    urlcountry = urlcountry[0]
                    urlcountry.confidence = 1.0
                    results["URL-DOMAIN_{}".format(
                        urlcountry)] = LocationDistribution(urlcountry)
                    results["URL-DOMAIN_{}".format(urlcountry)].frequency = 1

            if 5 < len(urlsubject) < 20:
                usubj_q = self.gazetteer.query(urlsubject, 15000)
                if usubj_q:
                    results["URL-SUBJECT_{}".format(
                        urlsubject)] = LocationDistribution(usubj_q)
                    results["URL-SUBJECT_{}".format(urlsubject)].frequency = 1
        return results

    def fuzzyquery(self, locmap, countryFilter=[]):
        """
        Retry ambiguous mentions with a fuzzy, country-restricted query and
        merge the results into the existing candidates.
        """
        for loc in locmap:
            if len(locmap[loc].realizations) != 1:
                freq = locmap[loc].frequency
                subres = self.gazetteer.query(loc,
                                              countryCode=countryFilter,
                                              fuzzy='AUTO')
                if subres != []:
                    # BUG FIX: wrap values() in list() so the concatenation
                    # also works when realizations.values() is a view.
                    pts = subres + list(locmap[loc].realizations.values())
                    ldist = self.gazetteer._get_loc_confidence(
                        pts, self.min_popln)
                    locmap[loc] = LocationDistribution(ldist)
                    locmap[loc].frequency = freq
        return locmap

    def annotate(self, doc, enrichmentKeys=['BasisEnrichment'], **kwargs):
        """
        Attach embersGeoCode to document
        """
        # Pick the last available enrichment key.
        eKey = None
        for key in enrichmentKeys:
            if key in doc and doc[key]:
                eKey = key

        if eKey is None:
            return doc

        try:
            lmap, gp = self.geocode(doc=doc, eKey=eKey, **kwargs)
        except UnicodeDecodeError as e:
            log.exception("unable to geocode:{}".format(str(e)))
            lmap, gp = {}, {}

        doc['embersGeoCode'] = gp
        doc["location_distribution"] = lmap
        return doc

    def score(self, results):
        """
        Aggregate frequency-weighted confidences per candidate place at each
        admin level, normalised by the total number of mentions.
        """
        scoresheet = defaultdict(float)
        num_mentions = float(sum((l.frequency for l in results.values())))

        def update(l):
            for s in l.city:
                scoresheet[s] += l.city[s] * l.frequency
            for s in l.admin1:
                scoresheet[s] += l.admin1[s] * l.frequency
            for s in l.country:
                scoresheet[s] += l.country[s] * l.frequency

        _ = [update(item) for item in results.viewvalues()]
        for s in scoresheet:
            scoresheet[s] /= num_mentions

        return scoresheet

    def get_locRanks(self, scores, loc_cand):
        """
        Each city score needs to be re-inforced with the
        corresponding state and country scores to get the actual meaning
        of that name. For example, several mentions of cities within virginia
        would have given virginia
        state a high score. Now this high score has to be brought back to lower levels to
        decide on meaning of each name/city
        """
        loc_rankmap = {}

        def get_realization_score(l):
            lscore_map = {}
            for lstr, r in l.realizations.viewitems():
                base_score = scores[lstr]
                if not isempty(r.city):
                    # City inherits its admin1 and country scores.
                    l_adminstr = '/'.join([r.country, r.admin1, ''])
                    base_score = (base_score + scores[l_adminstr] +
                                  scores[r.country + "//"]) * r.confidence

                elif not isempty(r.admin1):
                    # Admin1 inherits its country score.
                    base_score = (base_score +
                                  scores[r.country + "//"]) * r.confidence

                elif r.ltype == "country":
                    # Country scores stand alone.
                    pass
                else:
                    base_score = base_score * r.confidence

                lscore_map[lstr] = {
                    'score': base_score,
                    'geo_point': r.__dict__
                }

            return lscore_map

        for locpt in loc_cand:
            loc_rankmap[locpt] = get_realization_score(loc_cand[locpt])
        return loc_rankmap
Beispiel #6
0
class SteinerGeo():
    """
    Geocoder that builds a directed graph linking each entity mention to its
    candidate GeoNames expansions (geonameid -> admin1 -> country -> sink
    'E') and uses a Steiner-tree approximation to pick a mutually consistent
    resolution across all mentions.
    """

    def __init__(self, db, nerKeyMap=None, spacy=False):
        """
        Params:
            db        - GeoNames database handle/path
            nerKeyMap - optional mapping from NER tag names to the canonical
                        types used here; missing keys fall back to defaults
            spacy     - when True, also map spaCy tag names (GPE/NORP/ORG/LOC)
        """
        self.gazetteer = GeoNames(db, confMethod='Uniform', escore=False)
        DEFAULT_NER_MAP = {
            'LOCATION': 'LOCATION',
            'ORGANIZATION': 'ORGANIZATION',
            'NATIONALITY': 'NATIONALITY',
            'OTHER': 'OTHER',
            'PERSON': 'PERSON'
        }

        if nerKeyMap is None:
            nerKeyMap = DEFAULT_NER_MAP
        else:
            # Fill in any missing tags from the defaults.
            for key in DEFAULT_NER_MAP:
                if key not in nerKeyMap:
                    nerKeyMap[key] = DEFAULT_NER_MAP[key]

        if spacy is True:
            nerKeyMap['GPE'] = 'LOCATION'
            nerKeyMap['NORP'] = 'NATIONALITY'
            nerKeyMap['ORG'] = 'ORGANIZATION'
            nerKeyMap['LOC'] = 'LOCATION'

        self.nerKeyMap = nerKeyMap
        # Relative importance of each canonical entity type.
        self.weightage = {
            "LOCATION": 1.0,
            "NATIONALITY": 0.75,
            "ORGANIZATION": 0.5,
            "OTHER": 0.0,
            "PERSON": 0.0
        }

    def geocode(self, doc):
        """
        Geocode one enriched document.

        Params:
            doc - document with doc['BasisEnrichment']['entities']

        Returns:
            (G, locdist, focus): the pruned candidate graph, the per-mention
            expansion map, and the geo-focus terminal (or []).
        """
        # Bucket mention strings by canonical entity type; mentions are
        # split on commas and stripped of digits.
        entities = defaultdict(list)
        NAMED_ENTITY_TYPES_TO_CHECK = [
            key for key in self.nerKeyMap
            if self.weightage[self.nerKeyMap[key]] > 0
        ]
        _ = [
            entities[self.nerKeyMap[l['neType']]].extend(
                (x.strip() for x in numstrip.sub("", l['expr']).split(",")))
            for l in doc['BasisEnrichment']['entities']
            if (len(l['expr']) > 2) and (
                l['neType'] in NAMED_ENTITY_TYPES_TO_CHECK)
        ]

        # idmap: mention -> {expansions by geonameid, resolved flag, count};
        # cc collects country codes of unambiguously resolved mentions.
        idmap = {}
        cc = set()
        for loc in entities['LOCATION']:
            loc = loc.lower()
            if loc in idmap:
                idmap[loc]['count'] += 1
            else:
                expansions = self.gazetteer.query(loc)
                resolved = False
                if len(expansions) == 1:
                    resolved = True
                    cc.add(expansions[0].countryCode.lower())
                idmap[loc] = {
                    'expansions': {exp.geonameid: exp
                                   for exp in expansions},
                    'resolved': resolved,
                    'count': 1
                }

        # check if any organization is talking about a country
        organization_checklist = {}
        for org in (entities['ORGANIZATION'] +
                    entities.get('NATIONALITY', [])):
            # All-caps strings are assumed to be acronyms, not place names.
            if org.isupper():
                continue

            org = org.lower()
            country = self.gazetteer.query(org,
                                           fuzzy='AUTO',
                                           featureCode='pcli',
                                           operator='or')
            if country:
                cc.add(country[0].countryCode.lower())
                if org in idmap:
                    idmap[org]['count'] += 1
                else:
                    idmap[org] = {
                        'expansions': {exp.geonameid: exp
                                       for exp in country},
                        'resolved': True,
                        'count': 1
                    }
            else:
                # Unmatched organizations are retried later with a
                # country-restricted query.
                if org in organization_checklist:
                    organization_checklist[org] += 1
                else:
                    organization_checklist[org] = 1

        locdist = idmap
        if cc:
            locdist = self.fuzzyquery(idmap, organization_checklist, tuple(cc))
        G, focus = self.steiner_tree_approx(locdist)
        return G, locdist, focus

    def annotate(self, doc):
        """
        Attach 'location_distribution' and 'embersGeoCode' to doc, in place.

        For each mention, the chosen expansion is the candidate adjacent to
        the mention's terminal node (loc + "T0") in the Steiner tree.
        NOTE(review): uses .__next__() on the neighbors iterator, which is
        Python-3 only — confirm the intended runtime.
        """
        stG, locdist, focus = self.geocode(doc)
        doc['location_distribution'] = {
            loc: locdist[loc]['expansions'][stG.neighbors(
                unidecode(loc + u"T0")).__next__()].__dict__
            for loc in locdist if locdist[loc]['expansions']
        }
        if focus:
            # focus[0] is the terminal node name; strip the "T0" suffix.
            doc['embersGeoCode'] = doc['location_distribution'][focus[0][:-2]]
        else:
            doc['embersGeoCode'] = {}

        self.graph = stG
        return doc

    def steiner_tree_approx(self, locationMap):
        """
        Build the mention/candidate graph and prune it to (approximately)
        the minimal tree spanning all mention terminals plus the sink 'E'.

        Returns:
            (G, focus): the pruned subgraph and the terminal with the
            highest descendant-degree sum (geo focus), or [] when empty.
        """
        G = nx.DiGraph()
        terminalNodes = ["E"]
        for loc in locationMap:
            for rl in locationMap[loc]['expansions'].values():
                # Edge weight from the feature-code table; 12 is the
                # default penalty for unknown feature codes.
                eW = FEATURE_WEIGHTS.get(rl.featureCode, 12)
                nodename = unidecode(loc + u"T0")
                if rl.ltype == 'country':
                    edges = [(nodename, rl.geonameid, eW),
                             (rl.geonameid, rl.country, eW),
                             (rl.country, 'E', eW)]
                elif rl.ltype == 'admin1':
                    edges = [(nodename, rl.geonameid, eW),
                             (rl.geonameid, rl.admin1, eW),
                             (rl.admin1, rl.country, eW),
                             (rl.country, 'E', eW)]
                else:
                    # Cities and all other types chain through admin1.
                    edges = [(nodename, rl.geonameid, eW),
                             (rl.geonameid, rl.admin1, eW),
                             (rl.admin1, rl.country, eW),
                             (rl.country, 'E', eW)]

                G.add_weighted_edges_from(edges)
                terminalNodes.append(nodename)

        if G.number_of_nodes() == 0:
            return G, []

        stG = approximation.steiner_tree(G.to_undirected(), terminalNodes)

        def ego_nw_degree(degree, node):
            # Total degree of every node reachable from `node`.
            return sum((degree(p) for p in nx.descendants(G, node)))

        G = G.subgraph(stG)
        degree = G.degree()
        geofocus = sorted([(t, ego_nw_degree(degree, t))
                           for t in terminalNodes[1:]],
                          reverse=True)
        return G, geofocus[0] if geofocus else []

    def fuzzyquery(self, locmap, orgChecklist, countryFilter=[]):
        """
        Second pass: retry unresolved mentions (fuzzy) and unmatched
        organizations with a country-restricted query.

        Params:
            locmap        - mention -> expansion record map to update
            orgChecklist  - organization mention -> count
            countryFilter - country codes to restrict queries to
        """
        for loc in locmap:
            if locmap[loc]['resolved'] is False:
                subres = self.gazetteer.query(loc,
                                              countryCode=countryFilter,
                                              fuzzy='AUTO')
                new_exp = {res.geonameid: res for res in subres}
                if new_exp:
                    # Replaces (not merges) the earlier expansions.
                    locmap[loc]['expansions'] = (new_exp)

        for org in orgChecklist:
            subres = self.gazetteer.query(org, countryCode=countryFilter)
            locmap[org] = {
                "expansions": {res.geonameid: res
                               for res in subres},
                "resolved": len(subres) == 1,
                "count": orgChecklist[org]
            }
        return locmap
Beispiel #7
0
class PrepareTraining(object):
    def __init__(self, db, min_popln=0, min_length=1):
        """
        Set up the gazetteer and per-entity-type weights used for scoring.

        Params:
            db         - GeoNames database handle/path
            min_popln  - minimum population cutoff for candidates
            min_length - minimum character length of an entity mention
        """
        self.gazetteer = GeoNames(db)
        self.min_popln = min_popln
        self.min_length = min_length
        # Relative importance of each named-entity type.
        self.weightage = dict(LOCATION=1.0,
                              NATIONALITY=0.75,
                              ORGANIZATION=0.5,
                              OTHER=0.0)

    def geocode(self,
                doc=None,
                loclist=None,
                eKey='BasisEnrichment',
                **kwargs):
        """
        Extract weighted entity mentions (with their normalized document
        offsets) from a document and geocode them.

        Params:
            doc     - enriched document with doc[eKey]['entities'] and
                      doc[eKey]['tokens']
            loclist - optional extra location strings (lower-cased, appended)
            eKey    - enrichment key holding entities and tokens

        Returns:
            output of geocode_fromList

        Raises:
            ValueError when doc is None (doclength is derived from the doc;
            previously this path died with a NameError at the call below).
        """
        locTexts, persons = [], []
        NAMED_ENTITY_TYPES_TO_CHECK = [
            key for key in self.weightage if self.weightage[key] > 0
        ]
        # BUG FIX: doclength was referenced unconditionally in the final call
        # but only bound when doc was not None.
        if doc is None:
            raise ValueError(
                "doc is required: doclength is derived from doc[eKey]['tokens']")

        doclength = len(doc[eKey]['tokens'])

        # Mentions of interesting types, with the mention's normalized
        # mid-offset: (start + end) / (2 * doclength).
        locTexts += [
            (numstrip.sub("", l['expr'].lower()).strip(), l['neType'],
             (sum([int(_)
                   for _ in l['offset'].split(":")])) / (2.0 * doclength))
            for l in doc[eKey]["entities"]
            if ((l["neType"] in NAMED_ENTITY_TYPES_TO_CHECK)
                and len(l['expr']) >= self.min_length)
        ]

        # PERSON mentions with the same normalized offset.
        persons = [
            (numstrip.sub("", l['expr'].lower()).strip(),
             (sum([int(_)
                   for _ in l['offset'].split(":")])) / (2.0 * doclength))
            for l in doc[eKey]["entities"]
            if ((l["neType"] == "PERSON")
                and len(l['expr']) >= self.min_length)
        ]

        if loclist is not None:
            locTexts += [l.lower() for l in loclist]

        return self.geocode_fromList(locTexts,
                                     persons,
                                     doclength=doclength,
                                     **kwargs)

    def geocode_fromList(self,
                         locTexts,
                         persons,
                         results=None,
                         min_popln=None,
                         **kwargs):
        """
        Geocode mention tuples and collect per-mention metadata for training.

        Params:
            locTexts  - (text, neType, normalized_offset) tuples
            persons   - (text, normalized_offset) tuples of PERSON mentions
            results   - optional pre-seeded {mention: LocationDistribution}
            min_popln - population cutoff (defaults to self.min_popln)
            kwargs    - must include 'doclength' (token count), used to
                        offset split sub-mentions

        Returns:
            (results, freqsheet, locTexts, meta_entInfo, offset_diffmat,
             persons_res)
        """
        if results is None:
            results = {}

        if min_popln is None:
            min_popln = self.min_popln

        meta_entInfo = {}
        realized_countries = []
        idx = 0
        offsetmat = []
        for entitem in locTexts:
            querytext, enttype, offset = entitem
            if isempty(querytext):
                continue

            if querytext in results:
                results[querytext].frequency += 1
                # NOTE(review): assumes querytext already has a
                # meta_entInfo entry when present in results — confirm.
                meta_entInfo[querytext]["offsets"].append(offset)
                meta_entInfo[querytext]["neType"] = (enttype)
                meta_entInfo[querytext]["indexes"].append(idx)
                offsetmat.append(offset)
            else:
                # Comma-separated mentions are split; each part is nudged by
                # its position within the mention.
                for subidx, substr in enumerate(querytext.split(",")):
                    substr = substr.strip()
                    if substr in results:
                        results[substr].frequency += 1
                        meta_entInfo[substr]["offsets"].append(
                            offset + float(subidx) / kwargs['doclength'])
                        meta_entInfo[substr]["neType"] = (enttype)
                        meta_entInfo[substr]["indexes"].append(idx + subidx)
                        offsetmat.append(offset +
                                         float(subidx) / kwargs['doclength'])
                        continue

                    if substr not in meta_entInfo:
                        meta_entInfo[substr] = {
                            "offsets":
                            [offset + float(subidx) / kwargs['doclength']],
                            "neType":
                            enttype,
                            "indexes": [idx + subidx]
                        }
                        offsetmat.append(offset +
                                         float(subidx) / kwargs['doclength'])
                    else:
                        meta_entInfo[substr]["offsets"].append(
                            offset + float(subidx) / kwargs['doclength'])
                        meta_entInfo[substr]["neType"] = (enttype)
                        meta_entInfo[substr]["indexes"].append(idx + subidx)
                        offsetmat.append(offset +
                                         float(subidx) / kwargs['doclength'])

                    ld = self._queryitem(substr,
                                         meta_entInfo[substr]["neType"])
                    # Non-LOCATION mentions with no candidates are dropped.
                    if meta_entInfo[substr][
                            "neType"] != "LOCATION" and ld.isempty():
                        continue

                    results[substr] = ld
                    if len(results[substr].realizations) == 1:
                        realized_countries.append(
                            list(results[substr].realizations.values())[0]
                            ['countryCode'].lower())

                    results[substr].frequency = 1
                idx += subidx

            idx += 1

        # Pairwise offset differences between all recorded mentions.
        offsetmat = np.array(offsetmat)
        offset_diffmat = offsetmat[:, np.newaxis] - offsetmat
        # Most frequent country among unambiguously resolved mentions.
        # BUG FIX: removed the dead majority-vote computation (its result was
        # unconditionally overwritten) and narrowed the bare except to the
        # IndexError raised by most_common() on an empty counter.
        realized_countries = Counter(realized_countries)
        try:
            selco = realized_countries.most_common(n=1)[0][0]
        except IndexError:
            selco = []

        persons_res = {}
        for entitem in persons:
            querytext, offset = entitem
            if querytext not in persons_res:
                diffs = offsetmat - offset
                persons_res[querytext] = {
                    "expansions":
                    self._queryitem(querytext, "LOCATION", countryCode=selco),
                    "offset":
                    diffs,
                    "freq":
                    1
                }

            else:
                persons_res[querytext]["freq"] += 1

        if not isempty(selco):
            results = self.fuzzyquery(results, countryFilter=selco)

        freqsheet = self.score(results, meta_entInfo)

        return results, freqsheet, locTexts, meta_entInfo, offset_diffmat, persons_res

    def _queryitem(self, item, itemtype, **kwargs):
        """
        Query the gazetteer for a single mention.

        LOCATION mentions are queried directly; any other entity type is
        first tried as a country name (fuzzy, featureCode 'pcli') and, when
        that yields nothing, as a first-level admin region.

        Returns:
            LocationDistribution over the candidate expansions
        """
        if itemtype == "LOCATION":
            candidates = self.gazetteer.query(item, **kwargs)
        else:
            candidates = self.gazetteer.query(item,
                                              fuzzy='AUTO',
                                              featureCode='pcli',
                                              operator='or')
            if candidates == []:
                candidates = self.gazetteer.query(item,
                                                  featureCode='adm1',
                                                  operator='or')
        return LocationDistribution(candidates)

    def fuzzyquery(self, locmap, countryFilter=[]):
        """
        Retry ambiguous mentions with a fuzzy, country-restricted query.

        Mentions that already collapsed to a single realization are left
        untouched; for the rest, a non-empty fuzzy result replaces the old
        LocationDistribution while preserving the mention frequency.
        """
        for name in locmap:
            dist = locmap[name]
            if len(dist.realizations) == 1:
                continue
            hits = self.gazetteer.query(name,
                                        countryCode=countryFilter,
                                        fuzzy='AUTO')
            if hits != []:
                freq = dist.frequency
                locmap[name] = LocationDistribution(hits)
                locmap[name].frequency = freq
        return locmap

    def score(self, results, metaInfo):
        """
        Build per-level (city/admin1/country) frequency sheets.

        For every resolved mention, its frequency is added to each candidate
        key at each admin level, together with the token indexes at which
        the mention occurred.  Frequencies are normalised by the total
        mention count, and default factories are cleared so that later
        lookups of unknown keys raise instead of silently inserting.
        """
        sheet = defaultdict(lambda: defaultdict(lambda: {
            "freq": 0.0,
            "offs_idx": []
        }))
        total_mentions = float(sum(l.frequency for l in results.values()))

        def accumulate(name, dist):
            idxs = metaInfo[name]["indexes"]
            for level, keys in (("city", dist.city),
                                ("admin1", dist.admin1),
                                ("country", dist.country)):
                for key in keys:
                    sheet[level][key]["freq"] += dist.frequency
                    sheet[level][key]["offs_idx"] += (idxs)

        for name, dist in results.viewitems():
            accumulate(name, dist)

        for level in sheet:
            for key in sheet[level]:
                sheet[level][key]["freq"] /= total_mentions

            sheet[level].default_factory = None

        sheet.default_factory = None
        return sheet

    def _builddoc(self, doc, enrichmentKeys=['BasisEnrichment'], **kwargs):
        """
        Geocode *doc* and mark which expanded locations match its GSR events.

        Picks the last key in *enrichmentKeys* that is present and truthy in
        *doc* (later keys win). If none is found, returns *doc* unchanged.
        Otherwise runs the geocoding pipeline, matches its output against
        doc['events'] via matchwithGSRLocs, and records the matched location
        indexes under doc['match_indexes'].

        NOTE(review): the chosen eKey is computed but not forwarded to
        self.geocode() — presumably geocode defaults to the same key;
        confirm.

        NOTE(review): build_trainingdata() unpacks this method's result as
        `x, y = self._builddoc(doc)`, but the live return below is the doc
        dict itself; one of the two is stale (the commented-out
        `return Xmat, Ymat, pers_data` suggests this previously returned
        training data). Verify before reuse.

        NOTE(review): `enrichmentKeys=['BasisEnrichment']` is a mutable
        default argument; safe only because it is never mutated here.
        """
        eKey = None
        for key in enrichmentKeys:
            if key in doc and doc[key]:
                eKey = key

        # no usable enrichment -> nothing to geocode
        if eKey is None:
            return doc

        all_exp_locs, freqsheet, loctexts, metaInfo, offsdiffmat, persons_res = self.geocode(
            doc)
        label_locs, Xmat, Ymat, pers_data, idxes = self.matchwithGSRLocs(
            doc, all_exp_locs, persons_res, offsdiffmat, freqsheet, metaInfo)
        doc['match_indexes'] = idxes
        # Xmat, Ymat = [], []
        # for loc in all_exp_locs:
        #     if all_exp_locs[loc].haslabel is True:
        #         for r in all_exp_locs[loc].realizations.values():
        #             Ymat.append(r.label)
        #             Xmat.append(self.build_featuremat(r, offsdiffmat, freqsheet))

        # label_locs, freqsheet, loctexts, metaInfo, offsdiffmat,
        return doc
        # return Xmat, Ymat, pers_data

    def build_trainingdata(self, docs):
        """Accumulate feature/label training matrices over *docs*.

        NOTE(review): _builddoc's live code returns the doc dict (a single
        value), so the `x, y = ...` unpacking below would raise ValueError
        as written; it matches _builddoc's commented-out
        `return Xmat, Ymat, pers_data` variant instead. Verify which return
        is intended before using this method.
        """
        xmat, ymat = [], []
        for doc in docs:
            x, y = self._builddoc(doc)
            xmat += x
            ymat += y

        return xmat, ymat

    def calc_offset_stats(self, indices, diffmat):
        """Summarize pairwise token-offset differences for one location.

        Parameters
        ----------
        indices : index expression selecting the relevant rows/columns of
            *diffmat* (e.g. an np.ix_ result, or a list of row indexes).
        diffmat : 2-D numpy array of signed offset differences.

        Returns
        -------
        (mean_abs, min_abs, closest_before, closest_after) computed over the
        non-zero lower-triangular entries of the selected submatrix;
        (1, 1, 1, 1) when there are none. Missing before/after sides fall
        back to 1 individually.
        """
        tril = np.tril(diffmat[indices])
        ntril = tril[np.nonzero(tril)]
        abstril = np.abs(ntril)
        if abstril.shape[0] == 0:
            # no pairwise differences at all -> neutral statistics
            return 1, 1, 1, 1

        abs_minval = np.min(abstril)
        # NOTE(review): the name suggests a median but this is the mean of
        # the absolute differences; kept as-is to preserve behavior.
        medval = np.mean(abstril)

        # Explicit emptiness checks replace the original bare `except:`
        # clauses, which would also have hidden unrelated errors (np.min /
        # np.max raise ValueError on zero-size arrays).
        positives = ntril[ntril > 0]
        before_closest = np.min(positives) if positives.size else 1

        negatives = ntril[ntril < 0]
        after_closest = abs(np.max(negatives)) if negatives.size else 1

        return medval, abs_minval, before_closest, after_closest

    def _single_build_featuremat(self, realization, diffmat, freqsheet):
        country = realization.country
        admin = "/".join([country, realization.admin1])
        city = "/".join(
            [admin,
             getattr(realization, "admin2", "") or realization.city])
        featureCode = realization.featureCode
        offs = freqsheet["country"][country + "//"]["offs_idx"]
        co_offset = self.calc_offset_stats(np.ix_(offs, offs), diffmat)

        offs = freqsheet["admin1"][admin + "/"]["offs_idx"]
        st_offset = self.calc_offset_stats(np.ix_(offs, offs), diffmat)
        if realization.featureCode[:3] not in ("adm1", "pcli"):
            try:
                offs = freqsheet["city"][city]["offs_idx"]
                ci_offset = self.calc_offset_stats(np.ix_(offs, offs), diffmat)
                cifreq = freqsheet["city"][city]["freq"]
            except:
                ci_offset = [1, 1, 1, 1]
                cifreq = 0
        else:
            ci_offset = [0, 0, 0, 0]
            cifreq = 0

        return {
            "country": freqsheet["country"][country + "//"]["freq"],
            "state": freqsheet["admin1"][admin + "/"]["freq"],
            "city": cifreq,
            "poplnConf": realization.poplnConf,
            "co_Offmean": co_offset[0],
            "co_Offmin": co_offset[1],
            "co_prev": co_offset[2],
            "co_after": co_offset[3],
            "st_offmean": st_offset[0],
            "st_offmin": st_offset[1],
            "st_prev": st_offset[2],
            "st_after": st_offset[3],
            "ci_offmean": ci_offset[0],
            "ci_offmin": ci_offset[1],
            "ci_prev": ci_offset[2],
            "ci_after": ci_offset[3]
        }

    def build_persmat(self, realization, meta_info, freqsheet):
        """Build the feature dict for a location expanded from a PERSON entity.

        Like _single_build_featuremat, but offset statistics are computed
        from meta_info['offset'] and missing admin1/city entries fall back
        to the person's own mention frequency (meta_info['freq']) with
        neutral offset statistics instead of raising.

        Parameters
        ----------
        realization : GeoPoint-like object exposing country, admin1, city,
            optionally admin2, featureCode and poplnConf.
        meta_info : dict with 'offset' (difference matrix) and 'freq'.
        freqsheet : score() output: {level: {name: {"freq", "offs_idx"}}}.

        Returns
        -------
        dict of 16 numeric features (same keys as _single_build_featuremat).
        """
        country = realization.country
        admin = "/".join([country, realization.admin1])
        city = "/".join(
            [admin,
             getattr(realization, "admin2", "") or realization.city])
        # (removed an unused local `featureCode` binding)
        co_offset = self.calc_offset_stats(
            freqsheet["country"][country + "//"]["offs_idx"],
            meta_info['offset'])

        if (admin + "/") in freqsheet["admin1"]:
            st_offset = self.calc_offset_stats(
                freqsheet["admin1"][admin + "/"]["offs_idx"],
                meta_info["offset"])
            st_freq = freqsheet["admin1"][admin + "/"]["freq"]
        else:
            # state never mentioned directly: neutral offsets, person freq
            st_offset = [1, 1, 1, 1]
            st_freq = meta_info['freq']

        # NOTE(review): a 3-char slice can never equal the 4-char codes
        # "adm1"/"pcli", so this condition is always True and the else
        # branch is dead; kept as-is for consistency with training data.
        if realization.featureCode[:3] not in ("adm1", "pcli"):
            if city in freqsheet["city"]:
                ci_offset = self.calc_offset_stats(
                    freqsheet["city"][city]["offs_idx"], meta_info["offset"])
                cifreq = freqsheet["city"][city]["freq"]
            else:
                ci_offset = [1, 1, 1, 1]
                cifreq = meta_info["freq"]
        else:
            ci_offset = [0, 0, 0, 0]
            cifreq = 0

        return {
            "country": freqsheet["country"][country + "//"]["freq"],
            "state": st_freq,
            "city": cifreq,
            "poplnConf": realization.poplnConf,
            "co_Offmean": co_offset[0],
            "co_Offmin": co_offset[1],
            "co_prev": co_offset[2],
            "co_after": co_offset[3],
            "st_offmean": st_offset[0],
            "st_offmin": st_offset[1],
            "st_prev": st_offset[2],
            "st_after": st_offset[3],
            "ci_offmean": ci_offset[0],
            "ci_offmin": ci_offset[1],
            "ci_prev": ci_offset[2],
            "ci_after": ci_offset[3]
        }

    def build_featuremat(self, loc, *args):
        """Collect feature dicts and labels for every realization of *loc*.

        *args are forwarded to _single_build_featuremat (typically the
        offset-difference matrix and the frequency sheet). Returns the pair
        (features, labels) in realization iteration order.
        """
        realizations = loc.realizations
        feats = [
            self._single_build_featuremat(realizations[key], *args)
            for key in realizations
        ]
        labels = [realizations[key].label for key in realizations]
        return feats, labels

    def matchwithGSRLocs(self, doc, all_exp_locs, persons_res, offsdiffmat,
                         freqsheet, metaInfo):
        """Label geocoded realizations against the document's GSR events.

        Builds the set of "gold" location strings
        (country/state/city, lower-cased) from doc['events'] — including
        any 'expanded_loc' entries — marks each realization in
        *all_exp_locs* whose key matches as label=True, and collects
        feature/label matrices for every location with at least one match.
        Gold strings still unmatched are then tried against the
        person-derived expansions in *persons_res*.

        Mutates *all_exp_locs*, *persons_res* (labels, haslabel flags) and
        possibly evt['expanded_loc'] inside doc['events'].

        Returns
        -------
        (all_exp_locs, xmat, ymat, pers_data, matched_idx) where pers_data
        is [feature_dicts, labels] for person expansions, and matched_idx
        pairs each matched loc string with its mention indexes (None for
        person-derived matches).
        """
        locstrings = set()
        matched_idx = []
        for evt in doc['events']:
            estr = u"/".join(
                [evt['Country'], evt['State'], evt['City'].lower()])
            locstrings.add(estr.lower())
            # refresh the expanded location from the gazetteer when present
            if 'expanded_loc' in evt:
                try:
                    loc = self.gazetteer.get_locInfo(country=evt['Country'],
                                                     admin=evt['State'],
                                                     city=evt["City"])
                    evt['expanded_loc'] = loc
                except Exception as e:
                    # best-effort: gazetteer failure keeps the existing
                    # expanded_loc value (error deliberately swallowed)
                    pass

            if "expanded_loc" in evt:
                for loc in evt['expanded_loc']:
                    gp = GeoPoint(**loc)
                    lstr = "/".join([
                        gp.country, gp.admin1,
                        (getattr(gp, "admin2", "") or gp.city)
                    ])
                    locstrings.add(lstr.lower())

        matched_locs = set()
        xmat, ymat = [], []
        for loc in all_exp_locs:
            all_exp_locs[loc].haslabel = False
            for x in all_exp_locs[loc].realizations:
                if x.lower() in locstrings:
                    all_exp_locs[loc].realizations[x].label = True
                    all_exp_locs[loc].haslabel = True
                    matched_locs.add(x.lower())
                    matched_idx.append([loc, metaInfo[loc]['indexes']])
                else:
                    all_exp_locs[loc].realizations[x].label = False

            # only locations with at least one gold match contribute rows
            if all_exp_locs[loc].haslabel:
                x, y = self.build_featuremat(all_exp_locs[loc], offsdiffmat,
                                             freqsheet)
                xmat.append(x)
                ymat.append(y)

        # gold strings no document location matched: try person expansions
        remaininglocs = locstrings - matched_locs
        pers_data = [[], []]
        for loc in persons_res:
            persons_res[loc]['expansions'].haslabel = False
            for x in persons_res[loc]["expansions"].realizations:
                # person features are always collected; only the label
                # depends on whether the realization matches a gold string
                d1 = self.build_persmat(
                    persons_res[loc]["expansions"].realizations[x],
                    persons_res[loc], freqsheet)
                pers_data[0].append(d1)
                if x.lower() in remaininglocs:
                    persons_res[loc]["expansions"].haslabel = True
                    persons_res[loc]["expansions"].realizations[x].label = True
                    matched_idx.append([loc, None])
                    pers_data[1].append(True)
                else:
                    pers_data[1].append(False)

        return all_exp_locs, xmat, ymat, pers_data, matched_idx