Beispiel #1
0
    def get_geoPoints_intersection(self, gps):
        """Narrow candidate geo points to the country/admin1 they all share.

        Args:
            gps: mapping of name -> list of geo-point objects. Each point is
                read through its ``country`` and ``admin1`` attributes and
                its ``__str__()``.

        Returns:
            * ``gps`` itself (unchanged) when the points do not all share one
              country, when there are no points at all, or when the
              countries cannot be collected.
            * A new dict of per-name lists restricted to the shared country
              when the points do not all share one ``country/admin1/`` key.
            * Otherwise a new dict where each name maps to the points whose
              ``__str__()`` equals the shared key, or, if there are none, to
              the points whose ``country/admin1/`` key equals it.
        """
        # All points agree on a country exactly when one distinct value is
        # seen. Only failures of the attribute walk itself are tolerated, so
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        try:
            countries = {pt.country for name in gps for pt in gps[name]}
        except (AttributeError, TypeError):
            countries = set()

        if len(countries) != 1:
            return gps

        selcountry = countries.pop()
        admin_keys = {
            encode('/'.join([pt.country, pt.admin1, ""]))
            for name in gps for pt in gps[name] if pt.country == selcountry
        }

        if len(admin_keys) != 1:
            # Same country but no single admin1: keep the country filter only.
            return {
                name: [pt for pt in gps[name] if pt.country == selcountry]
                for name in gps
            }

        sel_admin1 = admin_keys.pop()
        ns = {}
        for name in gps:
            # Prefer points whose string form is exactly the shared key.
            exact = [gp for gp in gps[name] if gp.__str__() == sel_admin1]
            if exact:
                ns[name] = exact
                continue
            ns[name] = [
                gp for gp in gps[name]
                if encode("/".join([gp.country, gp.admin1, ""])) == sel_admin1
            ]
        return ns
Beispiel #2
0
    def geocode_fromList(self, locTexts, results=None, min_popln=None, **kwargs):
        """Resolve location mentions and pick the best type-weighted geo point.

        Args:
            locTexts: iterable of mentions. Each item is either a string or a
                ``(text, entity_type)`` tuple; plain strings are recorded with
                type ``'LOCATION'``. Empty strings are skipped.
            results: optional dict of already resolved mentions (text ->
                object with a ``frequency`` attribute). Updated in place.
            min_popln: forwarded to ``self.gazetteer.query``; defaults to
                ``self.min_popln``.
            **kwargs: forwarded unchanged to ``self.gazetteer.query``.

        Returns:
            ``(lmap, geo_point)``. ``lmap`` maps each mention with a
            non-empty rank dict to its highest-``'score'`` entry.
            ``geo_point`` is the ``'geo_point'`` of the mention whose score,
            weighted by ``self.weightage`` for its entity type, is largest;
            it is ``{}`` when there are no scores or nothing was ranked.
        """
        if results is None:
            results = {}

        if min_popln is None:
            min_popln = self.min_popln

        itype = {}
        for loc in locTexts:
            if loc == "":
                continue
            if isinstance(loc, tuple):
                itype[loc[0]] = loc[1]
                loc = loc[0]
            else:
                itype[loc] = 'LOCATION'
            try:
                if loc in results:
                    results[loc].frequency += 1
                    continue
                for sub in loc.split(","):
                    sub = sub.strip()
                    if sub in results:
                        results[sub].frequency += 1
                        continue
                    itype[sub] = itype[loc]
                    try:
                        query = self.gazetteer.query(sub, min_popln=min_popln,
                                                     **kwargs)
                        results[sub] = LocationDistribution(query)
                    except UnicodeDecodeError:
                        # Log and move on instead of dropping into a debugger;
                        # there is no entry for this piece to count.
                        log.exception(
                            "Unable to make query for string - {}".format(
                                encode(sub)))
                        continue
                    results[sub].frequency = 1
            except UnicodeDecodeError:
                log.exception("Unable to make query for string - {}".format(encode(loc)))

        scores = self.score(results)
        lrank = self.get_locRanks(scores, results)

        # Best candidate per mention; mentions with no candidates are dropped.
        # ``values()`` works on both Python 2 and Python 3 dicts.
        lmap = {}
        for name in lrank:
            ranks = lrank[name]
            if ranks == {}:
                continue
            lmap[name] = max(ranks.values(), key=lambda y: y['score'])

        # max() on an empty mapping would raise, so bail out early.
        if not scores or not lmap:
            return lmap, {}

        total_weight = sum(
            self.weightage[itype.get(key, 'OTHER')] for key in lmap)

        def weighted_score(item):
            name, candidate = item
            weight = self.weightage[itype.get(name, 'OTHER')]
            return candidate['score'] * weight / total_weight

        return lmap, max(lmap.items(), key=weighted_score)[1]['geo_point']
Beispiel #3
0
    def geocode_fromList(self, locTexts, results=None, min_popln=None):
        """Resolve location mentions and pick the highest-scoring geo point.

        Each mention is first queried as a whole; only when that query
        returns nothing is it split on commas and each piece queried.

        Args:
            locTexts: iterable of mention strings.
            results: optional dict of already resolved mentions (text ->
                object with a ``frequency`` attribute). Updated in place.
            min_popln: forwarded to ``self.gazetteer.query``; defaults to
                ``self.min_popln``.

        Returns:
            ``(lmap, geo_point)``. ``lmap`` maps each mention with a
            non-empty rank dict to its highest-``'score'`` entry.
            ``geo_point`` is the ``'geo_point'`` of the best entry overall;
            it is ``{}`` when there are no scores or nothing was ranked.
        """
        if results is None:
            results = {}

        if min_popln is None:
            min_popln = self.min_popln

        for loc in locTexts:
            try:
                if loc in results:
                    results[loc].frequency += 1
                    continue

                whole = self.gazetteer.query(loc, min_popln=min_popln)
                if whole:
                    results[loc] = LocationDistribution(whole)
                    results[loc].frequency = 1
                    continue

                # Nothing for the full string: fall back to its comma parts.
                for sub in loc.split(","):
                    sub = sub.strip()
                    if sub in results:
                        results[sub].frequency += 1
                    else:
                        results[sub] = LocationDistribution(
                            self.gazetteer.query(sub, min_popln=min_popln))
                        results[sub].frequency = 1
            except Exception:
                # Best effort per mention, but KeyboardInterrupt/SystemExit
                # are no longer swallowed.
                log.exception("Unable to make query for string - {}".format(
                    encode(loc)))

        scores = self.score(results)
        lrank = self.get_locRanks(scores, results)

        # Best candidate per mention; mentions with no candidates are dropped.
        # ``values()`` works on both Python 2 and Python 3 dicts.
        lmap = {}
        for name in lrank:
            ranks = lrank[name]
            if ranks == {}:
                continue
            lmap[name] = max(ranks.values(), key=lambda y: y['score'])

        # max() on an empty mapping would raise, so bail out early.
        if not scores or not lmap:
            return lmap, {}

        return lmap, max(lmap.values(), key=lambda x: x['score'])['geo_point']
Beispiel #4
0
        # with io.open(args.outfile, 'wb', encoding='utf8') as outfile:
        #     for ln in articles:
        #         # Convert Python Object (Dict) to JSON
        #         str_ = json.dumps(ln, sort_keys=True, ensure_ascii=False)
        #         outfile.write(to_unicode(str_) + "\n")


    else:

        for l in infile:
            try:
                j = json.loads(l)
                j = GEO.annotate(j)
                #log.debug("geocoded line no:{}, {}".format(lno,
                #                                           encode(j.get("link", ""))))
                lno += 1
                outfile.write(encode(json.dumps(j, ensure_ascii=False) + "\n"))
            except UnicodeEncodeError:
                log.exception("Unable to readline")
                continue

    t2 = time.time()
    passed_time = t2 - t1
    print('Time duration: ', passed_time)

    if not args.cat:
        infile.close()
        outfile.close()

    exit(0)
Beispiel #5
0
    def geocode_fromList(self,
                         locTexts,
                         persons,
                         results=None,
                         min_popln=None,
                         **kwargs):
        """Resolve location and person mentions and pick the best geo point.

        Args:
            locTexts: iterable of mentions. Each item is either a string or a
                ``(text, entity_type)`` tuple; plain strings are recorded with
                type ``'LOCATION'``. Empty strings are skipped.
            persons: iterable of 2-item entries whose first item is the text
                to look up; the second item is ignored.
            results: optional dict of already resolved mentions (text ->
                object with a ``frequency`` attribute). Updated in place
                unless ``self.fuzzyquery`` returns a different object.
            min_popln: defaults to ``self.min_popln``. This method does not
                otherwise use it.
            **kwargs: accepted for signature compatibility; this method does
                not use them.

        Returns:
            ``(lmap, geo_point)``. ``lmap`` maps each mention with a
            non-empty rank dict to its highest-``'score'`` entry.
            ``geo_point`` is the ``'geo_point'`` of the mention whose score,
            weighted by ``self.weightage`` for its entity type, is largest;
            it is ``{}`` when there are no scores or nothing was ranked.
        """
        if results is None:
            results = {}

        if min_popln is None:
            min_popln = self.min_popln

        itype = {}
        realized_countries = []
        for loc in locTexts:
            if loc == "":
                continue
            if isinstance(loc, tuple):
                itype[loc[0]] = loc[1]
                loc = loc[0]
            else:
                itype[loc] = 'LOCATION'
            try:
                if loc in results:
                    results[loc].frequency += 1
                    continue
                for sub in loc.split(","):
                    sub = sub.strip()
                    if sub in results:
                        results[sub].frequency += 1
                        continue
                    itype[sub] = itype[loc]
                    try:
                        results[sub] = self._queryitem(sub, itype[sub])
                        # A single realization pins down a country code.
                        if len(results[sub].realizations) == 1:
                            only = list(results[sub].realizations.values())[0]
                            realized_countries.append(
                                only['countryCode'].lower())
                    except UnicodeDecodeError:
                        # Log instead of dropping into a debugger; skip the
                        # piece when the lookup itself produced no entry.
                        log.exception(
                            "Unable to make query for string - {}".format(
                                encode(sub)))
                        if sub not in results:
                            continue
                    results[sub].frequency = 1
            except UnicodeDecodeError:
                log.exception("Unable to make query for string - {}".format(
                    encode(loc)))

        # Every country seen in a single-realization result is kept
        # (no majority vote is applied).
        selco = list(set(realized_countries))

        if selco:
            results = self.fuzzyquery(results, countryFilter=selco)

        persons_res = {}
        for entitem in persons:
            querytext, _ = entitem
            if querytext not in persons_res:
                persons_res[querytext] = {
                    "expansions":
                    self._queryitem(querytext, "LOCATION", countryCode=selco),
                    "freq":
                    1
                }
                if querytext not in results:
                    results[querytext] = persons_res[querytext]['expansions']
                    results[querytext].frequency = 1
            else:
                persons_res[querytext]["freq"] += 1
                results[querytext].frequency += 1

        scores = self.score(results)
        lrank = self.get_locRanks(scores, results)

        # Best candidate per mention; mentions with no candidates are dropped.
        # ``values()`` works on both Python 2 and Python 3 dicts.
        lmap = {}
        for name in lrank:
            ranks = lrank[name]
            if ranks == {}:
                continue
            lmap[name] = max(ranks.values(), key=lambda y: y['score'])

        # max() on an empty mapping would raise, so bail out early.
        if not scores or not lmap:
            return lmap, {}

        total_weight = sum(
            self.weightage[itype.get(key, 'OTHER')] for key in lmap)

        def weighted_score(item):
            name, candidate = item
            weight = self.weightage[itype.get(name, 'OTHER')]
            return candidate['score'] * weight / total_weight

        return lmap, max(lmap.items(), key=weighted_score)[1]['geo_point']