Example #1
0
class SupervisedGeo(object):
    def __init__(self,
                 db,
                 min_popln=0,
                 min_length=1,
                 model="./geoModels/rf_geo.pkl"):
        """Create the gazetteer, store the filters and load the classifier.

        Args:
            db: passed unchanged to ``GeoNames``.
            min_popln: stored as ``self.min_popln``.
            min_length: stored as ``self.min_length``; the shortest entity
                expression (in characters) that is considered.
            model: path of the pickle file loaded into ``self.model``.
        """
        self.gazetteer = GeoNames(db)
        self.min_popln, self.min_length = min_popln, min_length
        # Only entity types with a weight above zero are collected later on.
        self.weightage = dict(LOCATION=1.0,
                              NATIONALITY=0.75,
                              ORGANIZATION=0.5,
                              OTHER=0.0)
        # NOTE(review): unpickling executes arbitrary code; ``model`` should
        # only ever point at a trusted file.
        with open(model, "rb") as model_file:
            self.model = pickle.load(model_file)

    def _build_data(self,
                    doc=None,
                    loclist=None,
                    eKey='BasisEnrichment',
                    **kwargs):
        """Collect location-like and person mentions, then resolve them.

        Args:
            doc: enriched document. ``doc[eKey]`` must hold ``'tokens'`` and
                ``'entities'``; every used entity needs ``'expr'``,
                ``'neType'`` and an ``'offset'`` of colon-separated integers.
            loclist: optional extra location strings. They are lower-cased
                and added as ``"LOCATION"`` mentions at relative offset 0.0.
            eKey: key of the enrichment inside ``doc``.
            **kwargs: forwarded to ``_esquery_fromList``.

        Returns:
            The result of ``_esquery_fromList``.
        """
        locTexts, persons = [], []
        # Without a document there is no token count. Using 1 keeps the
        # offset arithmetic downstream defined instead of failing on an
        # unbound local.
        doclength = 1
        if doc is not None:
            enrichment = doc[eKey]
            doclength = len(enrichment['tokens'])
            checked_types = set(
                key for key in self.weightage if self.weightage[key] > 0)

            def relative_offset(entity):
                # Midpoint of the offset bounds, scaled by document length.
                bounds = [int(part) for part in entity['offset'].split(":")]
                return sum(bounds) / (2.0 * doclength)

            for entity in enrichment["entities"]:
                ne_type = entity["neType"]
                is_location_like = ne_type in checked_types
                is_person = ne_type == "PERSON"
                if not (is_location_like or is_person):
                    continue
                if len(entity['expr']) < self.min_length:
                    continue

                text = numstrip.sub("", entity['expr'].lower()).strip()
                if is_location_like:
                    locTexts.append((text, ne_type, relative_offset(entity)))
                if is_person:
                    persons.append((text, relative_offset(entity)))

        if loclist is not None:
            # _esquery_fromList unpacks (text, entity type, offset) triples,
            # so bare strings have to be wrapped.
            locTexts += [(l.lower(), "LOCATION", 0.0) for l in loclist]

        return self._esquery_fromList(locTexts,
                                      persons,
                                      doclength=doclength,
                                      **kwargs)

    def _esquery_fromList(self,
                          locTexts,
                          persons,
                          results=None,
                          min_popln=None,
                          **kwargs):
        """Resolve collected mentions against the gazetteer.

        Args:
            locTexts: iterable of ``(text, entity type, offset)`` triples.
            persons: iterable of ``(text, offset)`` pairs.
            results: optional dict of already resolved mentions, keyed by
                text; it is updated in place.
            min_popln: falls back to ``self.min_popln``; not used further in
                this method.
            **kwargs: must contain ``'doclength'`` whenever a mention has to
                be split on commas (it scales the per-part offset shift).

        Returns:
            Tuple ``(results, freqsheet, locTexts, meta_entInfo,
            offset_diffmat, persons_res, selco)``.
        """
        if results is None:
            results = {}

        if min_popln is None:
            min_popln = self.min_popln

        # Per-mention bookkeeping: offsets, last seen entity type, and the
        # positions of the mention's offsets inside ``offsetmat``.
        meta_entInfo = {}
        realized_countries = []
        # ``idx`` is the position in ``offsetmat`` of the next appended offset.
        idx = 0
        offsetmat = []
        for entitem in locTexts:
            querytext, enttype, offset = entitem
            if isempty(querytext):
                continue

            if querytext in results:
                # Whole mention already resolved: only update the counters.
                results[querytext].frequency += 1
                meta_entInfo[querytext]["offsets"].append(offset)
                meta_entInfo[querytext]["neType"] = (enttype)
                meta_entInfo[querytext]["indexes"].append(idx)
                offsetmat.append(offset)
            else:
                # Treat each comma-separated part as its own mention; part
                # number ``subidx`` is shifted by ``subidx / doclength``.
                for subidx, substr in enumerate(querytext.split(",")):
                    substr = substr.strip()
                    if substr in results:
                        results[substr].frequency += 1
                        meta_entInfo[substr]["offsets"].append(
                            offset + float(subidx) / kwargs['doclength'])
                        meta_entInfo[substr]["neType"] = (enttype)
                        meta_entInfo[substr]["indexes"].append(idx + subidx)
                        offsetmat.append(offset +
                                         float(subidx) / kwargs['doclength'])
                        continue

                    if substr not in meta_entInfo:
                        meta_entInfo[substr] = {
                            "offsets":
                            [offset + float(subidx) / kwargs['doclength']],
                            "neType":
                            enttype,
                            "indexes": [idx + subidx]
                        }
                        offsetmat.append(offset +
                                         float(subidx) / kwargs['doclength'])
                    else:
                        meta_entInfo[substr]["offsets"].append(
                            offset + float(subidx) / kwargs['doclength'])
                        meta_entInfo[substr]["neType"] = (enttype)
                        meta_entInfo[substr]["indexes"].append(idx + subidx)
                        offsetmat.append(offset +
                                         float(subidx) / kwargs['doclength'])

                    ld = self._queryitem(substr,
                                         meta_entInfo[substr]["neType"])
                    # Non-LOCATION mentions without any gazetteer hit are
                    # dropped from ``results`` (their metadata is kept).
                    if meta_entInfo[substr][
                            "neType"] != "LOCATION" and ld.isempty():
                        continue

                    results[substr] = ld
                    # A single realization is unambiguous; remember its country.
                    if len(results[substr].realizations) == 1:
                        realized_countries.append(
                            list(results[substr].realizations.values())[0]
                            ['countryCode'].lower())

                    results[substr].frequency = 1
                # ``subidx`` is the index of the last part; together with the
                # ``+= 1`` below, ``idx`` advances by the number of parts.
                idx += subidx

            idx += 1

        offsetmat = np.array(offsetmat)
        # Pairwise differences: entry [i, j] is offsetmat[i] - offsetmat[j].
        offset_diffmat = offsetmat[:, np.newaxis] - offsetmat
        # Every uniquely resolved country is used as a filter (duplicates
        # included). An earlier Counter-based selection of the dominant
        # country was disabled here.
        selco = realized_countries

        persons_res = {}
        for entitem in persons:
            querytext, offset = entitem
            if querytext not in persons_res:
                # Offsets of all location mentions relative to the first
                # occurrence of this person name.
                diffs = offsetmat - offset
                persons_res[querytext] = {
                    "expansions":
                    self._queryitem(querytext, "LOCATION", countryCode=selco),
                    "offset":
                    diffs,
                    "freq":
                    1
                }

            else:
                persons_res[querytext]["freq"] += 1

        if not isempty(selco):
            results = self.fuzzyquery(results, countryFilter=selco)

        freqsheet = self.score(results, meta_entInfo)

        return results, freqsheet, locTexts, meta_entInfo, offset_diffmat, persons_res, selco

    def _queryitem(self, item, itemtype, **kwargs):
        """Look ``item`` up in the gazetteer and wrap the hits.

        ``"LOCATION"`` items are queried with ``**kwargs``. Every other type
        is queried fuzzily with featureCode ``'pcli'`` and, if that returns
        ``[]``, again with featureCode ``'adm1'``; ``kwargs`` is not used on
        that path.

        Returns:
            ``LocationDistribution`` built from the query result.
        """
        if itemtype == "LOCATION":
            return LocationDistribution(self.gazetteer.query(item, **kwargs))

        hits = self.gazetteer.query(item,
                                    fuzzy='AUTO',
                                    featureCode='pcli',
                                    operator='or')
        if hits == []:
            hits = self.gazetteer.query(item,
                                        featureCode='adm1',
                                        operator='or')
        return LocationDistribution(hits)

    def fuzzyquery(self, locmap, countryFilter=None):
        """Re-query mentions that are not uniquely resolved.

        Every entry of ``locmap`` whose distribution does not have exactly
        one realization is queried again (fuzzy, restricted to
        ``countryFilter``). Non-empty hits are merged with the existing
        realizations into a new ``LocationDistribution`` that keeps the old
        frequency.

        Args:
            locmap: dict mapping mention text to its distribution; updated
                in place.
            countryFilter: value passed as ``countryCode`` to the gazetteer;
                defaults to an empty list.

        Returns:
            The same ``locmap`` object.
        """
        if countryFilter is None:
            countryFilter = []

        # Only existing keys are reassigned, so iterating the dict is safe.
        for loc in locmap:
            current = locmap[loc]
            if len(current.realizations) == 1:
                continue

            subres = self.gazetteer.query(loc,
                                          countryCode=countryFilter,
                                          fuzzy='AUTO')
            if subres != []:
                # list() keeps the concatenation valid where values() is a
                # view rather than a list.
                merged = LocationDistribution(
                    subres + list(current.realizations.values()))
                merged.frequency = current.frequency
                locmap[loc] = merged
        return locmap

    def score(self, results, metaInfo):
        """Aggregate mention frequencies per city, admin1 and country name.

        Args:
            results: dict mapping mention text to a distribution exposing
                ``frequency`` and the iterables ``city``, ``admin1`` and
                ``country``.
            metaInfo: dict mapping the same mention texts to a dict with an
                ``"indexes"`` list.

        Returns:
            Nested mapping ``sheet[level][name] = {"freq", "offs_idx"}`` where
            ``freq`` is the summed frequency divided by the total number of
            mentions and ``offs_idx`` concatenates the mentions' indexes.
            Missing keys raise ``KeyError`` (the default factories are
            cleared before returning).
        """
        scoresheet = defaultdict(lambda: defaultdict(lambda: {
            "freq": 0.0,
            "offs_idx": []
        }))
        num_mentions = float(sum(l.frequency for l in results.values()))

        for key, dist in results.items():
            offs = metaInfo[key]["indexes"]
            levels = (("city", dist.city), ("admin1", dist.admin1),
                      ("country", dist.country))
            for level, names in levels:
                for name in names:
                    entry = scoresheet[level][name]
                    entry["freq"] += dist.frequency
                    entry["offs_idx"] += offs

        for level_sheet in scoresheet.values():
            for entry in level_sheet.values():
                entry["freq"] /= num_mentions

            # Freeze the sheet so that later lookups of unknown names fail
            # instead of silently creating entries.
            level_sheet.default_factory = None

        scoresheet.default_factory = None
        return scoresheet

    def geocode(self, doc, enrichmentKeys=('BasisEnrichment',), **kwargs):
        """Attach location predictions and debug information to ``doc``.

        Args:
            doc: document mapping. If none of ``enrichmentKeys`` is present
                with a truthy value, ``doc`` is returned untouched.
            enrichmentKeys: candidate enrichment keys; when several match,
                the last one is used.
            **kwargs: accepted for interface compatibility; unused.

        Returns:
            ``doc`` with ``'true_geos'``, ``'location_distribution'``,
            ``'person_dist'`` and ``'geo_debug'`` added.
        """
        eKey = None
        for key in enrichmentKeys:
            if key in doc and doc[key]:
                eKey = key

        if eKey is None:
            return doc

        # Forward the enrichment key that was actually found instead of
        # relying on _build_data's default.
        (all_exp_locs, freqsheet, _loctexts, _meta_info, offsdiffmat,
         persons_res, selco) = self._build_data(doc, eKey=eKey)
        has_events = "events" in doc
        if has_events:
            self._expand_events(doc)

        transformer, classifier = self.model[0], self.model[1]
        locdist = {}
        clfdata = {}
        for loc in all_exp_locs:
            x, names = self.build_featuremat(all_exp_locs[loc], offsdiffmat,
                                             freqsheet)
            if not x:
                continue

            # Materialised so the debug payload is a list on every Python.
            clfdata[loc] = list(zip(names, x))
            ypred = classifier.predict_proba(transformer.transform(x))[:, 1]
            prob, final_nm = max(zip(ypred, names), key=lambda lx: lx[0])
            locdist[loc] = {
                "conf": prob,
                "details": all_exp_locs[loc].realizations[final_nm].__dict__
            }

        person_dist = {}
        for loc in persons_res:
            exps = persons_res[loc]["expansions"]
            names = list(exps.realizations)
            x = [
                self.build_persmat(exps.realizations[real], persons_res[loc],
                                   freqsheet) for real in names
            ]
            if not x:
                continue

            clfdata[loc] = list(zip(names, x))
            ypred = classifier.predict(transformer.transform(x))
            pred, nm = max(zip(ypred, names), key=lambda lx: lx[0])
            # Truthiness rather than ``is True``: numpy boolean predictions
            # are never identical to the ``True`` singleton.
            if pred:
                person_dist[loc] = exps.realizations[nm].__dict__

        if has_events:
            true_geos = self.matchwithGSRLocs(doc, all_exp_locs, persons_res,
                                              offsdiffmat, freqsheet)
        else:
            # No ground-truth events to compare against.
            true_geos = {'persons': {}, 'locations': {}}

        doc['true_geos'] = true_geos
        doc['location_distribution'] = locdist
        doc['person_dist'] = person_dist
        doc['geo_debug'] = {"selco": selco, "clfdata": clfdata}
        return doc

    def calc_offset_stats(self, indices, diffmat):
        tril = np.tril(diffmat[indices])
        ntril = tril[np.nonzero(tril)]
        abstril = np.abs(ntril)
        if abstril.shape[0] == 0:
            return 1, 1, 1, 1

        abs_minval = np.min(abstril)
        medval = np.mean(abstril)

        try:
            before_closest = np.min(ntril[ntril > 0])
        except:
            before_closest = 1

        try:
            after_closest = abs(np.max(ntril[ntril < 0]))
        except:
            after_closest = 1

        return medval, abs_minval, before_closest, after_closest

    def _single_build_featuremat(self, realization, diffmat, freqsheet):
        country = realization.country
        admin = "/".join([country, realization.admin1])
        city = "/".join(
            [admin,
             getattr(realization, "admin2", "") or realization.city])
        featureCode = realization.featureCode
        offs = freqsheet["country"][country + "//"]["offs_idx"]
        co_offset = self.calc_offset_stats(np.ix_(offs, offs), diffmat)

        try:
            offs = freqsheet["admin1"][admin + "/"]["offs_idx"]
            st_offset = self.calc_offset_stats(np.ix_(offs, offs), diffmat)
        except:
            st_offset = [0, 0, 0, 0]

        if realization.featureCode[:3] not in ("adm1", "pcli"):
            try:
                offs = freqsheet["city"][city]["offs_idx"]
                ci_offset = self.calc_offset_stats(np.ix_(offs, offs), diffmat)
                cifreq = freqsheet["city"][city]["freq"]
            except:
                ci_offset = [1, 1, 1, 1]
                cifreq = 0
        else:
            ci_offset = [0, 0, 0, 0]
            cifreq = 0

        return {
            "country":
            freqsheet["country"][country + "//"]["freq"],
            "state":
            freqsheet.get("admin1", {}).get(admin + "/", {}).get("freq", 0),
            "city":
            cifreq,
            "poplnConf":
            realization.poplnConf,
            "co_Offmean":
            co_offset[0],
            "co_Offmin":
            co_offset[1],
            "co_prev":
            co_offset[2],
            "co_after":
            co_offset[3],
            "st_offmean":
            st_offset[0],
            "st_offmin":
            st_offset[1],
            "st_prev":
            st_offset[2],
            "st_after":
            st_offset[3],
            "ci_offmean":
            ci_offset[0],
            "ci_offmin":
            ci_offset[1],
            "ci_prev":
            ci_offset[2],
            "ci_after":
            ci_offset[3]
        }

    def build_persmat(self, realization, meta_info, freqsheet):
        country = realization.country
        admin = "/".join([country, realization.admin1])
        city = "/".join(
            [admin,
             getattr(realization, "admin2", "") or realization.city])
        featureCode = realization.featureCode
        co_offset = self.calc_offset_stats(
            freqsheet["country"][country + "//"]["offs_idx"],
            meta_info['offset'])

        if (admin + "/") in freqsheet["admin1"]:
            st_offset = self.calc_offset_stats(
                freqsheet["admin1"][admin + "/"]["offs_idx"],
                meta_info["offset"])
            st_freq = freqsheet["admin1"][admin + "/"]["freq"]
        else:
            st_offset = [1, 1, 1, 1]
            st_freq = meta_info['freq']

        if realization.featureCode[:3] not in ("adm1", "pcli"):
            if city in freqsheet.get("city", {}):
                ci_offset = self.calc_offset_stats(
                    freqsheet["city"][city]["offs_idx"], meta_info["offset"])
                cifreq = freqsheet["city"][city]["freq"]
            else:
                ci_offset = [1, 1, 1, 1]
                cifreq = meta_info["freq"]
        else:
            ci_offset = [0, 0, 0, 0]
            cifreq = 0

        return {
            "country": freqsheet["country"][country + "//"]["freq"],
            "state": st_freq,
            "city": cifreq,
            "poplnConf": realization.poplnConf,
            "co_Offmean": co_offset[0],
            "co_Offmin": co_offset[1],
            "co_prev": co_offset[2],
            "co_after": co_offset[3],
            "st_offmean": st_offset[0],
            "st_offmin": st_offset[1],
            "st_prev": st_offset[2],
            "st_after": st_offset[3],
            "ci_offmean": ci_offset[0],
            "ci_offmin": ci_offset[1],
            "ci_prev": ci_offset[2],
            "ci_after": ci_offset[3]
        }
        #self.build_persmat(persons_res[loc].realizations[x])

    def build_featuremat(self, loc, *args):
        xmat = []
        lbls = []
        for real in loc.realizations:
            x = self._single_build_featuremat(loc.realizations[real], *args)
            lbls.append(real)
            xmat.append(x)

        #if xmat != []:
        #    xmat = self.model[0].transform(xmat)

        return xmat, lbls

    def _expand_events(self, doc):
        """Add ``'expanded_loc'`` to every event of ``doc`` that lacks it.

        The value is whatever ``self.gazetteer.get_locInfo`` returns for the
        event's ``'Country'``, ``'State'`` and ``'City'``. Expansion is best
        effort: an event whose lookup fails (including missing fields) is
        left unchanged.

        Returns:
            None; ``doc["events"]`` is modified in place.
        """
        import logging

        log = logging.getLogger(__name__)
        for evt in doc["events"]:
            if "expanded_loc" in evt:
                continue

            try:
                evt['expanded_loc'] = self.gazetteer.get_locInfo(
                    country=evt['Country'],
                    admin=evt['State'],
                    city=evt["City"])
            except Exception:
                # Deliberately non-fatal, but no longer silent.
                log.debug("could not expand event location", exc_info=True)

    def matchwithGSRLocs(self, doc, all_exp_locs, persons_res, offsdiffmat,
                         freqsheet):
        """Find realizations that coincide with the document's event locations.

        Event locations are the ``Country/State/City`` strings of
        ``doc['events']`` plus, for events carrying ``'expanded_loc'``, the
        ``country/admin1/(admin2 or city)`` string of each expansion.

        Args:
            doc: document mapping; a missing ``'events'`` key is treated as
                no events.
            all_exp_locs: dict of mention text to distribution.
            persons_res: dict of person text to a dict with ``"expansions"``.
            offsdiffmat: unused; kept for interface compatibility.
            freqsheet: unused; kept for interface compatibility.

        Returns:
            ``{'persons': {...}, 'locations': {...}}`` mapping mention text
            to the ``__dict__`` of the matching realization. Person mentions
            are only matched against event locations that no location
            mention matched.
        """
        events = doc['events'] if 'events' in doc else []
        locstrings = set()
        for evt in events:
            estr = u"/".join([evt['Country'], evt['State'], evt['City']])
            locstrings.add(estr)
            if "expanded_loc" in evt:
                for loc in evt['expanded_loc']:
                    gp = GeoPoint(**loc)
                    lstr = "/".join([
                        gp.country, gp.admin1,
                        (getattr(gp, "admin2", "") or gp.city)
                    ])
                    locstrings.add(lstr)

        matched_locs = set()
        true_geos = {'persons': {}, 'locations': {}}
        for loc in all_exp_locs:
            realizations = all_exp_locs[loc].realizations
            for name in realizations:
                if name in locstrings:
                    true_geos['locations'][loc] = realizations[name].__dict__
                    # Record the hit so that persons only compete for the
                    # locations that are still unexplained.
                    matched_locs.add(name)

        remaininglocs = locstrings - matched_locs
        for loc in persons_res:
            realizations = persons_res[loc]["expansions"].realizations
            for name in realizations:
                if name in remaininglocs:
                    true_geos['persons'][loc] = realizations[name].__dict__

        return true_geos
Example #2
0
class PrepareTraining(object):
    def __init__(self, db, min_popln=0, min_length=1):
        """Create the gazetteer and store the mention filters.

        Args:
            db: passed unchanged to ``GeoNames``.
            min_popln: stored as ``self.min_popln``.
            min_length: stored as ``self.min_length``; the shortest entity
                expression (in characters) that is considered.
        """
        self.gazetteer = GeoNames(db)
        self.min_popln, self.min_length = min_popln, min_length
        # Only entity types with a weight above zero are collected later on.
        self.weightage = dict(LOCATION=1.0,
                              NATIONALITY=0.75,
                              ORGANIZATION=0.5,
                              OTHER=0.0)

    def geocode(self,
                doc=None,
                loclist=None,
                eKey='BasisEnrichment',
                **kwargs):
        """Collect location-like and person mentions, then resolve them.

        Args:
            doc: enriched document. ``doc[eKey]`` must hold ``'tokens'`` and
                ``'entities'``; every used entity needs ``'expr'``,
                ``'neType'`` and an ``'offset'`` of colon-separated integers.
            loclist: optional extra location strings. They are lower-cased
                and added as ``"LOCATION"`` mentions at relative offset 0.0.
            eKey: key of the enrichment inside ``doc``.
            **kwargs: forwarded to ``geocode_fromList``.

        Returns:
            The result of ``geocode_fromList``.
        """
        locTexts, persons = [], []
        # Without a document there is no token count. Using 1 keeps the
        # offset arithmetic downstream defined instead of failing on an
        # unbound local.
        doclength = 1
        if doc is not None:
            enrichment = doc[eKey]
            doclength = len(enrichment['tokens'])
            checked_types = set(
                key for key in self.weightage if self.weightage[key] > 0)

            def relative_offset(entity):
                # Midpoint of the offset bounds, scaled by document length.
                bounds = [int(part) for part in entity['offset'].split(":")]
                return sum(bounds) / (2.0 * doclength)

            for entity in enrichment["entities"]:
                ne_type = entity["neType"]
                is_location_like = ne_type in checked_types
                is_person = ne_type == "PERSON"
                if not (is_location_like or is_person):
                    continue
                if len(entity['expr']) < self.min_length:
                    continue

                text = numstrip.sub("", entity['expr'].lower()).strip()
                if is_location_like:
                    locTexts.append((text, ne_type, relative_offset(entity)))
                if is_person:
                    persons.append((text, relative_offset(entity)))

        if loclist is not None:
            # geocode_fromList unpacks (text, entity type, offset) triples,
            # so bare strings have to be wrapped.
            locTexts += [(l.lower(), "LOCATION", 0.0) for l in loclist]

        return self.geocode_fromList(locTexts,
                                     persons,
                                     doclength=doclength,
                                     **kwargs)

    def geocode_fromList(self,
                         locTexts,
                         persons,
                         results=None,
                         min_popln=None,
                         **kwargs):
        """Resolve collected mentions against the gazetteer.

        Args:
            locTexts: iterable of ``(text, entity type, offset)`` triples.
            persons: iterable of ``(text, offset)`` pairs.
            results: optional dict of already resolved mentions, keyed by
                text; it is updated in place.
            min_popln: falls back to ``self.min_popln``; not used further in
                this method.
            **kwargs: must contain ``'doclength'`` whenever a mention has to
                be split on commas (it scales the per-part offset shift).

        Returns:
            Tuple ``(results, freqsheet, locTexts, meta_entInfo,
            offset_diffmat, persons_res)``.
        """
        if results is None:
            results = {}

        if min_popln is None:
            min_popln = self.min_popln

        # Per-mention bookkeeping: offsets, last seen entity type, and the
        # positions of the mention's offsets inside ``offsetmat``.
        meta_entInfo = {}
        realized_countries = []
        # ``idx`` is the position in ``offsetmat`` of the next appended offset.
        idx = 0
        offsetmat = []
        for entitem in locTexts:
            querytext, enttype, offset = entitem
            if isempty(querytext):
                continue

            if querytext in results:
                # Whole mention already resolved: only update the counters.
                results[querytext].frequency += 1
                meta_entInfo[querytext]["offsets"].append(offset)
                meta_entInfo[querytext]["neType"] = (enttype)
                meta_entInfo[querytext]["indexes"].append(idx)
                offsetmat.append(offset)
            else:
                # Treat each comma-separated part as its own mention; part
                # number ``subidx`` is shifted by ``subidx / doclength``.
                for subidx, substr in enumerate(querytext.split(",")):
                    substr = substr.strip()
                    if substr in results:
                        results[substr].frequency += 1
                        meta_entInfo[substr]["offsets"].append(
                            offset + float(subidx) / kwargs['doclength'])
                        meta_entInfo[substr]["neType"] = (enttype)
                        meta_entInfo[substr]["indexes"].append(idx + subidx)
                        offsetmat.append(offset +
                                         float(subidx) / kwargs['doclength'])
                        continue

                    if substr not in meta_entInfo:
                        meta_entInfo[substr] = {
                            "offsets":
                            [offset + float(subidx) / kwargs['doclength']],
                            "neType":
                            enttype,
                            "indexes": [idx + subidx]
                        }
                        offsetmat.append(offset +
                                         float(subidx) / kwargs['doclength'])
                    else:
                        meta_entInfo[substr]["offsets"].append(
                            offset + float(subidx) / kwargs['doclength'])
                        meta_entInfo[substr]["neType"] = (enttype)
                        meta_entInfo[substr]["indexes"].append(idx + subidx)
                        offsetmat.append(offset +
                                         float(subidx) / kwargs['doclength'])

                    ld = self._queryitem(substr,
                                         meta_entInfo[substr]["neType"])
                    # Non-LOCATION mentions without any gazetteer hit are
                    # dropped from ``results`` (their metadata is kept).
                    if meta_entInfo[substr][
                            "neType"] != "LOCATION" and ld.isempty():
                        continue

                    results[substr] = ld
                    # A single realization is unambiguous; remember its country.
                    if len(results[substr].realizations) == 1:
                        realized_countries.append(
                            list(results[substr].realizations.values())[0]
                            ['countryCode'].lower())

                    results[substr].frequency = 1
                # ``subidx`` is the index of the last part; together with the
                # ``+= 1`` below, ``idx`` advances by the number of parts.
                idx += subidx

            idx += 1

        offsetmat = np.array(offsetmat)
        # Pairwise differences: entry [i, j] is offsetmat[i] - offsetmat[j].
        offset_diffmat = offsetmat[:, np.newaxis] - offsetmat

        # The most frequent uniquely resolved country filters the follow-up
        # queries; an empty list means "no filter". (A >=50% share list used
        # to be computed here but was immediately overwritten.)
        top_country = Counter(realized_countries).most_common(1)
        selco = top_country[0][0] if top_country else []

        persons_res = {}
        for entitem in persons:
            querytext, offset = entitem
            if querytext not in persons_res:
                # Offsets of all location mentions relative to the first
                # occurrence of this person name.
                diffs = offsetmat - offset
                persons_res[querytext] = {
                    "expansions":
                    self._queryitem(querytext, "LOCATION", countryCode=selco),
                    "offset":
                    diffs,
                    "freq":
                    1
                }

            else:
                persons_res[querytext]["freq"] += 1

        if not isempty(selco):
            results = self.fuzzyquery(results, countryFilter=selco)

        freqsheet = self.score(results, meta_entInfo)

        return results, freqsheet, locTexts, meta_entInfo, offset_diffmat, persons_res

    def _queryitem(self, item, itemtype, **kwargs):
        """Look ``item`` up in the gazetteer and wrap the hits.

        ``"LOCATION"`` items are queried with ``**kwargs``. Every other type
        is queried fuzzily with featureCode ``'pcli'`` and, if that returns
        ``[]``, again with featureCode ``'adm1'``; ``kwargs`` is not used on
        that path.

        Returns:
            ``LocationDistribution`` built from the query result.
        """
        if itemtype == "LOCATION":
            return LocationDistribution(self.gazetteer.query(item, **kwargs))

        hits = self.gazetteer.query(item,
                                    fuzzy='AUTO',
                                    featureCode='pcli',
                                    operator='or')
        if hits == []:
            hits = self.gazetteer.query(item,
                                        featureCode='adm1',
                                        operator='or')
        return LocationDistribution(hits)

    def fuzzyquery(self, locmap, countryFilter=None):
        """Re-query mentions that are not uniquely resolved.

        Every entry of ``locmap`` whose distribution does not have exactly
        one realization is queried again (fuzzy, restricted to
        ``countryFilter``). Non-empty hits replace the entry with a new
        ``LocationDistribution`` that keeps the old frequency.

        Args:
            locmap: dict mapping mention text to its distribution; updated
                in place.
            countryFilter: value passed as ``countryCode`` to the gazetteer;
                defaults to an empty list.

        Returns:
            The same ``locmap`` object.
        """
        if countryFilter is None:
            countryFilter = []

        # Only existing keys are reassigned, so iterating the dict is safe.
        for loc in locmap:
            if len(locmap[loc].realizations) == 1:
                continue

            freq = locmap[loc].frequency
            subres = self.gazetteer.query(loc,
                                          countryCode=countryFilter,
                                          fuzzy='AUTO')
            if subres != []:
                locmap[loc] = LocationDistribution(subres)
                locmap[loc].frequency = freq
        return locmap

    def score(self, results, metaInfo):
        """Aggregate mention frequencies per city, admin1 and country name.

        Args:
            results: dict mapping mention text to a distribution exposing
                ``frequency`` and the iterables ``city``, ``admin1`` and
                ``country``.
            metaInfo: dict mapping the same mention texts to a dict with an
                ``"indexes"`` list.

        Returns:
            Nested mapping ``sheet[level][name] = {"freq", "offs_idx"}`` where
            ``freq`` is the summed frequency divided by the total number of
            mentions and ``offs_idx`` concatenates the mentions' indexes.
            Missing keys raise ``KeyError`` (the default factories are
            cleared before returning).
        """
        scoresheet = defaultdict(lambda: defaultdict(lambda: {
            "freq": 0.0,
            "offs_idx": []
        }))
        num_mentions = float(sum(l.frequency for l in results.values()))

        for key, dist in results.items():
            offs = metaInfo[key]["indexes"]
            levels = (("city", dist.city), ("admin1", dist.admin1),
                      ("country", dist.country))
            for level, names in levels:
                for name in names:
                    entry = scoresheet[level][name]
                    entry["freq"] += dist.frequency
                    entry["offs_idx"] += offs

        for level_sheet in scoresheet.values():
            for entry in level_sheet.values():
                entry["freq"] /= num_mentions

            # Freeze the sheet so that later lookups of unknown names fail
            # instead of silently creating entries.
            level_sheet.default_factory = None

        scoresheet.default_factory = None
        return scoresheet

    def _builddoc(self, doc, enrichmentKeys=['BasisEnrichment'], **kwargs):
        """
        Attach embersGeoCode to document
        """
        eKey = None
        for key in enrichmentKeys:
            if key in doc and doc[key]:
                eKey = key

        if eKey is None:
            return doc

        all_exp_locs, freqsheet, loctexts, metaInfo, offsdiffmat, persons_res = self.geocode(
            doc)
        label_locs, Xmat, Ymat, pers_data, idxes = self.matchwithGSRLocs(
            doc, all_exp_locs, persons_res, offsdiffmat, freqsheet, metaInfo)
        doc['match_indexes'] = idxes
        # Xmat, Ymat = [], []
        # for loc in all_exp_locs:
        #     if all_exp_locs[loc].haslabel is True:
        #         for r in all_exp_locs[loc].realizations.values():
        #             Ymat.append(r.label)
        #             Xmat.append(self.build_featuremat(r, offsdiffmat, freqsheet))

        # label_locs, freqsheet, loctexts, metaInfo, offsdiffmat,
        return doc
        # return Xmat, Ymat, pers_data

    def build_trainingdata(self, docs):
        """Concatenate the per-document ``x`` and ``y`` lists over ``docs``.

        NOTE(review): ``_builddoc`` as written returns the document itself,
        not an ``(x, y)`` pair, so the unpacking below does not match its
        return value. Confirm the intended contract before relying on this
        method.

        Returns:
            Tuple ``(xmat, ymat)`` of the accumulated lists.
        """
        xmat, ymat = [], []
        for doc in docs:
            x, y = self._builddoc(doc)
            xmat += x
            ymat += y

        return xmat, ymat

    def calc_offset_stats(self, indices, diffmat):
        tril = np.tril(diffmat[indices])
        ntril = tril[np.nonzero(tril)]
        abstril = np.abs(ntril)
        if abstril.shape[0] == 0:
            return 1, 1, 1, 1

        abs_minval = np.min(abstril)
        medval = np.mean(abstril)

        try:
            before_closest = np.min(ntril[ntril > 0])
        except:
            before_closest = 1

        try:
            after_closest = abs(np.max(ntril[ntril < 0]))
        except:
            after_closest = 1

        return medval, abs_minval, before_closest, after_closest

    def _single_build_featuremat(self, realization, diffmat, freqsheet):
        country = realization.country
        admin = "/".join([country, realization.admin1])
        city = "/".join(
            [admin,
             getattr(realization, "admin2", "") or realization.city])
        featureCode = realization.featureCode
        offs = freqsheet["country"][country + "//"]["offs_idx"]
        co_offset = self.calc_offset_stats(np.ix_(offs, offs), diffmat)

        offs = freqsheet["admin1"][admin + "/"]["offs_idx"]
        st_offset = self.calc_offset_stats(np.ix_(offs, offs), diffmat)
        if realization.featureCode[:3] not in ("adm1", "pcli"):
            try:
                offs = freqsheet["city"][city]["offs_idx"]
                ci_offset = self.calc_offset_stats(np.ix_(offs, offs), diffmat)
                cifreq = freqsheet["city"][city]["freq"]
            except:
                ci_offset = [1, 1, 1, 1]
                cifreq = 0
        else:
            ci_offset = [0, 0, 0, 0]
            cifreq = 0

        return {
            "country": freqsheet["country"][country + "//"]["freq"],
            "state": freqsheet["admin1"][admin + "/"]["freq"],
            "city": cifreq,
            "poplnConf": realization.poplnConf,
            "co_Offmean": co_offset[0],
            "co_Offmin": co_offset[1],
            "co_prev": co_offset[2],
            "co_after": co_offset[3],
            "st_offmean": st_offset[0],
            "st_offmin": st_offset[1],
            "st_prev": st_offset[2],
            "st_after": st_offset[3],
            "ci_offmean": ci_offset[0],
            "ci_offmin": ci_offset[1],
            "ci_prev": ci_offset[2],
            "ci_after": ci_offset[3]
        }

    def build_persmat(self, realization, meta_info, freqsheet):
        country = realization.country
        admin = "/".join([country, realization.admin1])
        city = "/".join(
            [admin,
             getattr(realization, "admin2", "") or realization.city])
        featureCode = realization.featureCode
        co_offset = self.calc_offset_stats(
            freqsheet["country"][country + "//"]["offs_idx"],
            meta_info['offset'])

        if (admin + "/") in freqsheet["admin1"]:
            st_offset = self.calc_offset_stats(
                freqsheet["admin1"][admin + "/"]["offs_idx"],
                meta_info["offset"])
            st_freq = freqsheet["admin1"][admin + "/"]["freq"]
        else:
            st_offset = [1, 1, 1, 1]
            st_freq = meta_info['freq']

        if realization.featureCode[:3] not in ("adm1", "pcli"):
            if city in freqsheet["city"]:
                ci_offset = self.calc_offset_stats(
                    freqsheet["city"][city]["offs_idx"], meta_info["offset"])
                cifreq = freqsheet["city"][city]["freq"]
            else:
                ci_offset = [1, 1, 1, 1]
                cifreq = meta_info["freq"]
        else:
            ci_offset = [0, 0, 0, 0]
            cifreq = 0

        return {
            "country": freqsheet["country"][country + "//"]["freq"],
            "state": st_freq,
            "city": cifreq,
            "poplnConf": realization.poplnConf,
            "co_Offmean": co_offset[0],
            "co_Offmin": co_offset[1],
            "co_prev": co_offset[2],
            "co_after": co_offset[3],
            "st_offmean": st_offset[0],
            "st_offmin": st_offset[1],
            "st_prev": st_offset[2],
            "st_after": st_offset[3],
            "ci_offmean": ci_offset[0],
            "ci_offmin": ci_offset[1],
            "ci_prev": ci_offset[2],
            "ci_after": ci_offset[3]
        }
        #self.build_persmat(persons_res[loc].realizations[x])

    def build_featuremat(self, loc, *args):
        xmat, ymat = [], []
        for real in loc.realizations:
            x = self._single_build_featuremat(loc.realizations[real], *args)
            y = loc.realizations[real].label
            xmat.append(x)
            ymat.append(y)

        return xmat, ymat

    def matchwithGSRLocs(self, doc, all_exp_locs, persons_res, offsdiffmat,
                         freqsheet, metaInfo):
        locstrings = set()
        matched_idx = []
        for evt in doc['events']:
            estr = u"/".join(
                [evt['Country'], evt['State'], evt['City'].lower()])
            locstrings.add(estr.lower())
            if 'expanded_loc' in evt:
                try:
                    loc = self.gazetteer.get_locInfo(country=evt['Country'],
                                                     admin=evt['State'],
                                                     city=evt["City"])
                    evt['expanded_loc'] = loc
                except Exception as e:
                    pass

            if "expanded_loc" in evt:
                for loc in evt['expanded_loc']:
                    gp = GeoPoint(**loc)
                    lstr = "/".join([
                        gp.country, gp.admin1,
                        (getattr(gp, "admin2", "") or gp.city)
                    ])
                    locstrings.add(lstr.lower())

        matched_locs = set()
        xmat, ymat = [], []
        for loc in all_exp_locs:
            all_exp_locs[loc].haslabel = False
            for x in all_exp_locs[loc].realizations:
                if x.lower() in locstrings:
                    all_exp_locs[loc].realizations[x].label = True
                    all_exp_locs[loc].haslabel = True
                    matched_locs.add(x.lower())
                    matched_idx.append([loc, metaInfo[loc]['indexes']])
                else:
                    all_exp_locs[loc].realizations[x].label = False

            if all_exp_locs[loc].haslabel:
                x, y = self.build_featuremat(all_exp_locs[loc], offsdiffmat,
                                             freqsheet)
                xmat.append(x)
                ymat.append(y)

        remaininglocs = locstrings - matched_locs
        pers_data = [[], []]
        for loc in persons_res:
            persons_res[loc]['expansions'].haslabel = False
            for x in persons_res[loc]["expansions"].realizations:
                d1 = self.build_persmat(
                    persons_res[loc]["expansions"].realizations[x],
                    persons_res[loc], freqsheet)
                pers_data[0].append(d1)
                if x.lower() in remaininglocs:
                    persons_res[loc]["expansions"].haslabel = True
                    persons_res[loc]["expansions"].realizations[x].label = True
                    matched_idx.append([loc, None])
                    pers_data[1].append(True)
                else:
                    pers_data[1].append(False)

        return all_exp_locs, xmat, ymat, pers_data, matched_idx