def main():
    '''
    Reads tweets from the input queue (or stdin), geo-annotates them, and
    publishes them to the output queue (or stdout).
    '''
    ap = args.get_parser()
    ap.add_argument('--cat', action="store_true",
                    help='Read input from standard in and write to standard out.')
    arg = ap.parse_args()
    logs.init(arg)
    geo_mena = GeoMena()
    geo_lac = Geo(geo_region=GEO_REGION.lac)
    try:
        if arg.cat:
            log.debug('Reading from stdin and writing to stdout.')
            ins = sys.stdin
            outs = sys.stdout
            for entry in ins:
                entry = entry.decode(encoding='utf-8')
                try:
                    tweet = json.loads(entry.strip())
                    geo_annotate(tweet, geo_mena, geo_lac)
                    if tweet is not None:
                        outs.write(json.dumps(tweet, ensure_ascii=False).encode("utf-8"))
                        outs.write('\n')
                        outs.flush()
                except Exception:
                    log.exception('Failed to process message "%s".', entry)
        else:
            queue.init(arg)
            with queue.open(arg.sub, 'r') as inq:
                with queue.open(arg.pub, 'w', capture=True) as outq:
                    for tweet in inq:
                        try:
                            content = geo_annotate(tweet, geo_mena, geo_lac)
                            if content is not None:
                                outq.write(content)
                        except KeyboardInterrupt:
                            log.info("Got SIGINT, exiting.")
                            break
                        except Exception:
                            log.exception('Failed to process message "%s".', tweet)
        return 0
    except Exception as e:
        log.exception("Unknown error in main function: %s", e)
        return 1
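
# geo_annotate() is used above but not defined in this snippet.  A minimal
# hypothetical sketch, assuming both geocoders expose the geo_normalize()
# call used by the other scripts here and that the result is attached under
# an assumed "embersGeoCode" key (both the API and the key are assumptions):
def geo_annotate(tweet, geo_mena, geo_lac):
    # try the MENA geocoder first, then fall back to the LAC one
    for coder in (geo_mena, geo_lac):
        try:
            city, country, state = coder.geo_normalize(tweet)[:3]
            if country:
                tweet['embersGeoCode'] = {'city': city, 'country': country,
                                          'state': state}
                break
        except Exception:
            continue
    return tweet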
def collectMentions(self):
    geo = Geo()
    tweetCount = 0
    tweetErrorCount = 0
    totalFiles = len(os.listdir(self.inputFolder))
    fileCount = 1
    for _file in sorted(os.listdir(self.inputFolder)):
        fileDate = datetime.strptime(_file[17:27], "%Y-%m-%d")
        if fileDate > self.toDate or fileDate < self.fromDate:
            continue
        log.debug("processing file %d/%d --> %s" % (fileCount, totalFiles, _file))
        fileCount += 1
        try:
            with open(self.inputFolder + "/" + _file, "r") as FILE:
                for line in FILE:
                    try:
                        jsonTweet = json.loads(line.strip())
                        geoList = geo.geo_normalize(jsonTweet)
                        city = geoList[0]
                        country = geoList[1]
                        state = geoList[2]
                        if ((self.city == '-' and self.state == '-' and
                             country and country.lower() == self.country) or
                            (country and country.lower() == self.country and
                             state and state.lower() == self.state) or
                            (country and country.lower() == self.country and
                             state and state.lower() == self.state and
                             city and city.lower() == self.city)):
                            tweetCount += 1
                            # use [5:25] if HH:MM:SS is needed
                            datestr = jsonTweet["interaction"]["created_at"][5:16]
                            klout, sentiment = getKloutSentiment(jsonTweet)
                            tweeterId = jsonTweet["interaction"]["author"]["id"]
                            candidatesFound = self.processTweet(jsonTweet)
                            self.updateScoreCard(candidatesFound, tweeterId,
                                                 klout, sentiment, datestr)
                    except Exception as f:
                        log.exception("error processing tweet %s", f)
                        tweetErrorCount += 1
        except Exception as e:
            log.exception("error processing file %s", e)
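
# getKloutSentiment() is called above but defined elsewhere in the repo.  A
# hypothetical sketch, assuming the DataSift "klout" and "salience"
# enrichments are present on the tweet (the field paths are an assumption,
# not taken from this source):
def getKloutSentiment(jsonTweet):
    klout = jsonTweet.get('klout', {}).get('score', 0)
    sentiment = jsonTweet.get('salience', {}).get('content', {}).get('sentiment', 0)
    return klout, sentiment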
def execute(arg):
    logs.init(arg)
    fromDate = datetime.strptime(arg.fromDate, "%d %b %Y")
    toDate = datetime.strptime(arg.toDate, "%d %b %Y")
    tweetFolder = arg.tweetFolder
    country = arg.country

    hashTagCounts = {}
    uids = {}

    # load twitter handles from the seed file
    with open(arg.seedFile, 'r') as _file:
        for line in _file:
            handle, candidate = line.strip().split(',')
            if candidate not in uids:
                uids[candidate] = []
                hashTagCounts[candidate] = {}
            uids[candidate].append(handle.lower())

    # for geolocation
    geo = Geo()

    for _file in sorted(os.listdir(tweetFolder)):
        fileDate = datetime.strptime(_file[17:27], '%Y-%m-%d')
        if fromDate <= fileDate < toDate:
            log.info("processing file %s" % _file)
            try:
                with open(tweetFolder + "/" + _file, "r") as FILE:
                    for line in FILE:
                        try:
                            jsonTweet = json.loads(line.strip())
                            dateStr = jsonTweet['interaction']['created_at'][5:16]
                            tweetDate = datetime.strptime(dateStr, '%d %b %Y')
                            geoList = geo.geo_normalize(jsonTweet)
                            city, ctry, state = geoList[:3]
                            if ctry and ctry.lower() == country and fromDate <= tweetDate <= toDate:
                                userId, realName = None, None
                                if 'twitter' in jsonTweet:
                                    if 'user' in jsonTweet['twitter']:
                                        if 'screen_name' in jsonTweet['twitter']['user']:
                                            userId = jsonTweet['twitter']['user']['screen_name'].lower()
                                        if 'name' in jsonTweet['twitter']['user']:
                                            realName = jsonTweet['twitter']['user']['name'].lower()
                                if userId is None and realName is None:
                                    continue
                                log.debug('userId or realName is not None')
                                candidate = getCandidate(userId, realName, uids)
                                if candidate is not None:
                                    log.debug('found candidate --> ' + candidate)
                                    # pre-process the tweet text
                                    text = jsonTweet["interaction"]["content"]
                                    text = re.sub(URL_REGEX, ' ', text)  # remove urls
                                    text = re.sub('[^A-Za-z_@#0-9]', ' ', normalize_str(text, lower=True))  # keep only alphanumerics and twitter tags
                                    text = re.sub(' +', ' ', text)  # collapse multiple spaces

                                    hashTags = extract_hash_tags(text)
                                    hashTags = [h for h in hashTags if len(h) > 3]
                                    for hashTag in hashTags:
                                        if hashTag.startswith('#'):
                                            hashTag = hashTag[1:]
                                        if hashTag in hashTagCounts[candidate]:
                                            hashTagCounts[candidate][hashTag] += 1
                                        else:
                                            hashTagCounts[candidate][hashTag] = 1
                        except Exception as e:
                            log.exception('error processing tweet %s' % e)
            except Exception as f:
                log.exception('error processing file %s' % f)
        else:
            log.debug('skipping file %s' % _file)
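
# extract_hash_tags() is shared by several of these scripts but not defined
# here.  A minimal regex-based sketch (an assumption, not necessarily the
# repo's implementation); the callers above strip any leading '#' themselves:
import re

def extract_hash_tags(text):
    return re.findall(r'#\w+', text)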
__version__ = "0.0.1" import json import gzip from geoutils.dbManager import ESWrapper from geocode_twitter import TweetGeocoder from embers.geocode import Geo, decode from embers.geocode_mena import GeoMena as MENAGEO DB = ESWrapper('geonames', 'places') GEO = TweetGeocoder(DB) ptrue, pfalse = 0, 0 error = open("error_colombia.txt", "w") mGeo = MENAGEO() eGeo = Geo() eGeo = mGeo def embersgeo(doc): msg = json.loads(doc) try: lt, ln, places, texts, enr = eGeo._normalize_payload(msg) true_geo = eGeo._geo_normalize(lt, ln, None, {}, None, eGeo.priority_policy) true_geo = { "city": decode(true_geo[0]), "country": decode(true_geo[1]), "admin1": decode(true_geo[2]) }
def preProcess(tweetFolder, outputFolder, keywordList, fromDate, toDate, country):
    log.info("inside preProcess")
    log.debug("fromDate --> " + fromDate.strftime("%d %b %Y"))
    log.debug("toDate --> " + toDate.strftime("%d %b %Y"))

    tweets = {}

    # output files
    tweetedFile = open(outputFolder + '/tweeted.csv', 'w')
    mentionFile = open(outputFolder + '/mentioned.csv', 'w')
    # retweetFile = open(outputFolder + '/retweet.csv', 'w')
    wordsFile = open(outputFolder + '/containsWord.csv', 'w')
    sentimentFile = open(outputFolder + '/sentiment.csv', 'w')
    tweetsFile = open(outputFolder + '/tweets.json', 'w')

    # build stop word list
    # englishStopWords = [normalize_str(w).lower() for w in stopwords.words('english')]
    # spanishStopWords = [normalize_str(w).lower() for w in stopwords.words('spanish')]
    # stopWordList = []
    # stopWordList.extend(englishStopWords)
    # stopWordList.extend(spanishStopWords)

    log.info("# of keywords: " + str(len(keywordList)))
    log.info("tracking --> " + str(keywordList))

    # build a regular expression matching any keyword
    keywordRegex = re.compile(r'\b%s\b' % r'\b|\b'.join(keywordList),
                              flags=re.IGNORECASE)

    # for geocoding tweets
    geo = Geo()
    tweetCount, tweetErrorCount = 0, 0

    for _file in sorted(os.listdir(tweetFolder)):
        fileDate = datetime.strptime(_file[17:27], '%Y-%m-%d')
        if fromDate <= fileDate < toDate:
            log.info("processing file %s" % _file)
            try:
                with open(tweetFolder + "/" + _file, "r") as FILE:
                    for line in FILE:
                        try:
                            jsonTweet = json.loads(line.strip())
                            dateStr = jsonTweet['interaction']['created_at'][5:16]
                            tweetDate = datetime.strptime(dateStr, '%d %b %Y')
                            geoList = geo.geo_normalize(jsonTweet)
                            city, ctry, state = geoList[:3]
                            if ctry and ctry.lower() == country and fromDate <= tweetDate <= toDate:
                                # pre-process the tweet text
                                text = jsonTweet["interaction"]["content"]
                                text = re.sub(URL_REGEX, ' ', text)  # remove urls
                                text = re.sub('[^A-Za-z_@#0-9]', ' ', normalize_str(text, lower=True))  # keep only alphanumerics and twitter tags
                                text = re.sub(' +', ' ', text)  # collapse multiple spaces

                                keywordsPresent = re.findall(keywordRegex, text)
                                keywordsPresent = list(set(keywordsPresent))
                                if len(keywordsPresent) > 0:
                                    tweetId = jsonTweet["twitter"]["id"]
                                    tweeterId = str(jsonTweet["interaction"]["author"]["id"])
                                    mentions = getInteractions(jsonTweet)
                                    sentiment = getSentiment(jsonTweet)

                                    hashTags = extract_hash_tags(text)
                                    hashTags = [h for h in hashTags if len(h) > 3]
                                    # hashTags.extend(keywordsPresent)
                                    if len(hashTags) == 0:
                                        continue
                                    hashTags = list(set(hashTags).union(set(keywordsPresent)))

                                    tweetedFile.write(tweeterId + ',' + tweetId + '\n')
                                    sentimentFile.write(tweetId + ',' + str(sentiment) + '\n')
                                    for userId in mentions:
                                        mentionFile.write(tweetId + ',' + userId + '\n')
                                    # for userId in retweets:
                                    #     retweetFile.write(tweetId + ',' + userId + '\n')
                                    for hashTag in hashTags:
                                        if hashTag.startswith('#'):
                                            hashTag = hashTag[1:]
                                        wordsFile.write(tweetId + ',' + hashTag + '\n')

                                    # track the tweets for later checks
                                    if tweeterId not in tweets:
                                        tweets[tweeterId] = {}
                                    tweets[tweeterId][tweetId] = jsonTweet["interaction"]["content"]

                                    tweetCount += 1
                        except Exception as f:
                            log.debug("error processing tweet %s", f)
                            tweetErrorCount += 1
            except Exception as e:
                log.exception("error processing file %s", e)
        else:
            log.debug("skipping file %s" % _file)
def trackTweets(tweetFolder, vocab, fromDate, toDate, country, threshold):
    counts = {}
    regex = {}
    totalWords = 0

    # build a regex for each group from its top-weighted words
    for group in vocab:
        counts[group] = {}
        sorted_tuples = sorted(vocab[group].iteritems(),
                               key=operator.itemgetter(1), reverse=True)
        words = []
        if len(sorted_tuples) <= 20:
            cutoff = len(sorted_tuples)
        else:
            # keep the top `threshold` percent of words; a local cutoff so the
            # percentage is not clobbered between groups
            cutoff = int(len(sorted_tuples) * threshold // 100)
        for (word, weight) in sorted_tuples[:cutoff]:
            words.append(word)
            totalWords += 1
        regex[group] = re.compile(r'\b%s\b' % r'\b|\b'.join(words),
                                  flags=re.IGNORECASE)
    log.info("tracking a total of %d words" % totalWords)

    # for geocoding tweets
    geo = Geo()
    tweetCount, tweetErrorCount = 0, 0

    for _file in sorted(os.listdir(tweetFolder)):
        fileDate = datetime.strptime(_file[17:27], "%Y-%m-%d")
        if fromDate <= fileDate < toDate:
            log.info("processing file %s" % _file)
            try:
                with open(tweetFolder + "/" + _file, "r") as FILE:
                    for line in FILE:
                        try:
                            jsonTweet = json.loads(line.strip())
                            dateStr = jsonTweet["interaction"]["created_at"][5:16]
                            tweetDate = datetime.strptime(dateStr, "%d %b %Y")
                            geoList = geo.geo_normalize(jsonTweet)
                            city, ctry, state = geoList[:3]
                            if ctry and ctry.lower() == country and fromDate <= tweetDate <= toDate:
                                # pre-process the tweet text
                                text = jsonTweet["interaction"]["content"]
                                text = re.sub(URL_REGEX, " ", text)  # remove urls
                                text = re.sub("[^A-Za-z_@#0-9]", " ", normalize_str(text, lower=True))  # keep only alphanumerics and twitter tags
                                text = re.sub(" +", " ", text)  # collapse multiple spaces

                                for group in regex:
                                    keywordsPresent = re.findall(regex[group], text)
                                    if len(keywordsPresent) > 0:
                                        keywordsPresent = list(set(keywordsPresent))
                                        hashTags = extract_hash_tags(text)
                                        hashTags = [h for h in hashTags if len(h) > 3]
                                        hashTags.extend(keywordsPresent)
                                        for hashTag in hashTags:
                                            if hashTag.startswith("#"):
                                                hashTag = hashTag[1:]
                                            if hashTag in counts[group]:
                                                counts[group][hashTag] += 1
                                            else:
                                                counts[group][hashTag] = 1
                                        tweetCount += 1
                        except Exception as f:
                            log.debug("error processing tweet %s", f)
                            tweetErrorCount += 1
            except Exception as e:
                log.exception("error processing file %s", e)
        else:
            log.debug("skipping file %s" % _file)
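
# A small worked example of the cutoff logic above: with 200 scored words and
# threshold=10, the top int(200 * 10 // 100) == 20 words per group are kept.
# The toy vocab below has fewer than 20 entries, so all of them are compiled
# into the group's regex:
import re

vocab = {'protest': {'marcha': 5.0, 'huelga': 3.0, 'paro': 2.0}}
words = [w for w, _ in sorted(vocab['protest'].items(),
                              key=lambda kv: kv[1], reverse=True)]
pattern = re.compile(r'\b%s\b' % r'\b|\b'.join(words), flags=re.IGNORECASE)
print(pattern.findall('gran Marcha y paro nacional'))  # ['Marcha', 'paro']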
def preProcess(tweetFolder, outputFolder, keywordList, fromDate, toDate,
               country, filesProcessed):
    log.info("inside preProcess")
    log.debug("fromDate --> " + fromDate.strftime("%d %b %Y"))
    log.debug("toDate --> " + toDate.strftime("%d %b %Y"))

    tweetCount, tweetErrorCount = 0, 0
    tweets = {}

    # output files
    tweetedFile = open(outputFolder + '/tweeted.csv', 'w')
    mentionFile = open(outputFolder + '/mentioned.csv', 'w')
    retweetFile = open(outputFolder + '/retweet.csv', 'w')
    wordsFile = open(outputFolder + '/containsWord.csv', 'w')
    sentimentFile = open(outputFolder + '/sentiment.csv', 'w')
    tweetsFile = open(outputFolder + '/tweets.json', 'w')

    # build stop word list
    englishStopWords = [normalize_str(w).lower() for w in stopwords.words('english')]
    spanishStopWords = [normalize_str(w).lower() for w in stopwords.words('spanish')]
    stopWordList = []
    stopWordList.extend(englishStopWords)
    stopWordList.extend(spanishStopWords)

    log.info("# of keywords: " + str(len(keywordList)))
    log.info("tracking --> " + str(keywordList))

    # build a regular expression matching any keyword
    keywordRegex = re.compile(r'\b%s\b' % r'\b|\b'.join(keywordList),
                              flags=re.IGNORECASE)

    # for geocoding tweets
    geo = Geo()

    log.info("filesProcessed --> " + str(filesProcessed))
    for _file in sorted(os.listdir(tweetFolder)):
        fileDate = datetime.strptime(_file[17:27], '%Y-%m-%d')
        if _file not in filesProcessed and fromDate <= fileDate < toDate:
            log.info("processing file %s" % _file)
            try:
                with open(tweetFolder + "/" + _file, "r") as FILE:
                    tweetCount, tweetErrorCount = 0, 0
                    for line in FILE:
                        try:
                            jsonTweet = json.loads(line.strip())
                            dateStr = jsonTweet['interaction']['created_at'][5:16]
                            tweetDate = datetime.strptime(dateStr, '%d %b %Y')
                            sentiment = getSentiment(jsonTweet)
                            if sentiment == 0:
                                continue
                            geoList = geo.geo_normalize(jsonTweet)
                            ctry, a1, a2, a3 = geoList[1:5]
                            if ctry and ctry.lower() == country and fromDate <= tweetDate <= toDate:
                                text = jsonTweet["interaction"]["content"]
                                text = re.sub(URL_REGEX, ' ', text)  # remove urls
                                text = re.sub('[^A-Za-z_@#0-9]', ' ', normalize_str(text, lower=True))  # keep only alphanumerics and twitter tags
                                text = re.sub(' +', ' ', text)  # collapse multiple spaces

                                keywordsPresent = keywordRegex.search(text)
                                if keywordsPresent is not None:
                                    words = text.split(" ")
                                    words = [w for w in words
                                             if len(w) > 2 and w not in stopWordList]
                                    words2 = []
                                    for word in words:
                                        # drop words containing digits or '@'
                                        # unless they are tracked keywords
                                        for w in word:
                                            if (word not in keywordList) and (w.isdigit() or w == '@'):
                                                break
                                        else:
                                            if word[0] == '#':
                                                word = word[1:]
                                            words2.append(word)

                                    tweetId = jsonTweet["twitter"]["id"]
                                    tweeterId = str(jsonTweet["interaction"]["author"]["id"])
                                    mentions, retweets = getInteractions(jsonTweet)

                                    tweetedFile.write(tweeterId + ',' + tweetId + '\n')
                                    sentimentFile.write(tweetId + ',' + str(sentiment) + '\n')
                                    for userId in mentions:
                                        mentionFile.write(tweetId + ',' + userId + '\n')
                                    for userId in retweets:
                                        retweetFile.write(tweetId + ',' + userId + '\n')
                                    for word in words2:
                                        wordsFile.write(tweetId + ',' + word + '\n')

                                    # track the tweets for later checks
                                    if tweeterId not in tweets:
                                        tweets[tweeterId] = {}
                                    tweets[tweeterId][tweetId] = jsonTweet["interaction"]["content"]

                                    tweetCount += 1
                        except Exception as f:
                            log.exception("error processing tweet %s", f)
                            tweetErrorCount += 1
            except Exception as e:
                log.exception("error processing file %s", e)
            log.info("tweets used: %s" % str(tweetCount))
            log.debug("tweetErrorCount: %s" % str(tweetErrorCount))
            filesProcessed.append(_file)
            # process one new file per invocation
            break
        else:
            log.debug("skipping file %s" % _file)
sys.path.insert(1, path)

from embers.geocode import Geo
from etool import args, logs
from datetime import datetime

log = logs.getLogger(__processor__)

if __name__ == "__main__":
    ap = args.get_parser()
    ap.add_argument('-t', '--tweetFolder', type=str,
                    help='inputFolder pointing to PSLs output',
                    default='/hdd/tweets/2012/oct')
    ap.add_argument('-c', '--country', type=str)
    ap.add_argument('-m', '--month', type=str)
    arg = ap.parse_args()
    logs.init(arg)

    geo = Geo()
    tweetCount = 0
    date = datetime.strptime(arg.month, "%b %Y")

    for _file in os.listdir(arg.tweetFolder):
        try:
            with open(arg.tweetFolder + "/" + _file, "r") as FILE:
                for line in FILE:
                    try:
                        jsonTweet = json.loads(line.strip())
                        dateStr = jsonTweet['interaction']['created_at'][5:16]
                        tweetDate = datetime.strptime(dateStr, '%d %b %Y')
                        geoList = geo.geo_normalize(jsonTweet)
                        city, ctry, state = geoList[:3]
                        if (ctry and ctry.lower() == arg.country.lower() and
                                date.month == tweetDate.month and
                                date.year == tweetDate.year):
                            tweetCount += 1
                    except Exception as f:
                        # truncated in the source; assumed handler, following
                        # the pattern of the sibling scripts
                        log.exception("error processing tweet %s", f)
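
# Every script above parses the capture date out of a fixed offset in the
# input filename via _file[17:27].  A hypothetical example (the real naming
# scheme is an assumption; only the [17:27] slice comes from the source):
from datetime import datetime

fname = 'tweets-colombia--2012-10-05.json'
print(fname[17:27])                                 # '2012-10-05'
print(datetime.strptime(fname[17:27], '%Y-%m-%d'))  # 2012-10-05 00:00:00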
class WarningParser(object):
    def __init__(self, transforms=None, createCols=None, complex_fns={}):
        self.et_cls = ("011", "012", "013", "014", "015", "016")
        self.pop_cls = (u'Business', u'Media', u'Medical', u'Legal',
                        u'General Population', u'Refugees/Displaced',
                        u'Ethnic', u'Agricultural', u'Labor', u'Religious',
                        u'Education')
        self.viol_cls = ("1", "2")
        if transforms:
            self.transforms = transforms
        else:
            self.transforms = {'eventDate': lambda x: parse(x[:10]),
                               'date': lambda x: parse(x[:10])}
        if createCols:
            self.createCols = createCols
        else:
            self.createCols = {'month': {'transformCol': 'eventDate',
                                         'transformFn': lambda x: x[:7]},
                               'country': {'transformCol': 'location',
                                           'transformFn': lambda x: x[0]},
                               'model_short': {'transformCol': 'model',
                                               'transformFn': self.get_model},
                               'eventcode': {'transformCol': 'eventType',
                                             'transformFn': lambda x: x[:3]}}
        if complex_fns:
            self.complex_fns = complex_fns
        else:
            self.complex_fns = {"reportingDelay":
                                lambda x: (x['date'] - x['eventDate']) / np.timedelta64(1, 'D')}
        self.seen = shelve.open('persistent_shelve.db')
        self.embersgeo = None

    def parse(self, gsrObj, geo=False):
        if geo:
            self.embersgeo = Geo()
        if isinstance(gsrObj, file):
            gsr = [self._formatcheck(json.loads(l), geo) for l in gsrObj if l.strip()]
        elif isinstance(gsrObj, basestring):
            with open(gsrObj) as gfile:
                gsr = [self._formatcheck(json.loads(l), geo) for l in gfile if l.strip()]
        elif isinstance(gsrObj, list):
            gsr = [self._formatcheck(j, geo) for j in gsrObj]
        else:
            raise NotImplementedError
        gsr_df = self._dfmap(gsr)
        return gsr_df

    def _dfmap(self, gsr):
        return DataFrame_mod(gsr).multiapply(applyfns=self.transforms,
                                             newCols=self.createCols,
                                             complexfns=self.complex_fns)

    def _formatcheck(self, j, geo=False):
        if "classification" not in j:
            try:
                if len(j["eventType"]) < 4:
                    j["eventType"] += "1"
                j["classification"] = {"eventType": {k: 0.0 for k in self.et_cls},
                                       "population": {k: 0.0 for k in self.pop_cls},
                                       "violence": {k: 0.0 for k in self.viol_cls}}
                j["classification"]["eventType"][j["eventType"][:3]] = 1.0
                j["classification"]["violence"][j["eventType"][3]] = 1.0
                j["classification"]["population"][j["population"]] = 1.0
            except Exception:
                pass
        if isinstance(j["classification"], basestring):
            j["classification"] = json.loads(j["classification"])
        if isinstance(j["classification"], list):
            j["classification"] = j["classification"][0]
        if "matched_gsr" in j and isinstance(j["matched_gsr"], basestring):
            j["matched_gsr"] = json.loads(j["matched_gsr"])
        if "match_score" in j and isinstance(j["match_score"], basestring):
            j["match_score"] = json.loads(j["match_score"])
        # all empty location fields have to be expressed using '-' instead of ''
        if 'location' in j:
            j['location'] = ['-' if l.strip() == '' else l for l in j['location']]
        if geo:
            j['locInfo'] = self.get_locInfo(j['location'])
            j['coordinates'] = ([j['locInfo']['latitude'], j['locInfo']['longitude']]
                                if j['locInfo'] else [None, None])
        return j

    def is_citylevel(self, loc):
        """Determine if city-level info is present in the location."""
        return loc[2] != "" and loc[2] != "-"

    def get_locInfo(self, loctuple, canonicalLT=None):
        lstr = encode(','.join(loctuple))
        if lstr in self.seen:
            return self.seen[lstr]
        loc_headers = ("city", "country", "state", "admin2", "admin3",
                       "population_size", "latitude", "longitude", "id", "freq")
        if not self.is_citylevel(loctuple):
            return None
        aliasLT = None
        if not canonicalLT:
            co, st, ci = reverse_osicorrection(loctuple)
            # if not self.is_citylevel(loctuple):
            #     canonicalLT, aliasLT = self.embersgeo.best_guess(ci, co, st)
            canonicalLT, aliasLT = self.embersgeo.best_guess_city(ci, co, st)
        if canonicalLT:
            try:
                canonicalLT[0] = [decode(l) for l in canonicalLT[0]]
            except Exception:
                pass
            ldict = dict(zip(loc_headers, canonicalLT[0]))
            self.seen[lstr] = ldict
            return ldict
        if aliasLT:
            try:
                aliasLT[0] = [decode(l) for l in aliasLT[0]]
            except Exception:
                pass
            ldict = dict(zip(loc_headers, aliasLT[0]))
            self.seen[lstr] = ldict
            return ldict
        return None

    def get_model(self, name):
        """Get the model's short name."""
        nl = name.lower()
        if "dynamic query" in nl:
            return "dqe"
        if "lasso" in nl:
            return "lasso"
        if "baserate" in nl:
            return "br"
        if "planned" in nl:
            return "pp"
        if "mle" in nl:
            return "mle"
        if "civil unrest fast-spatial-scan" in nl:
            return "ss"
        if "locrewriter" in nl:
            return "lw"
        return nl
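
# A minimal usage sketch for WarningParser (the input file name is
# hypothetical): parse a line-delimited warnings/GSR JSON file into a
# DataFrame without geocoding, then inspect the derived columns.
wp = WarningParser()
gsr_df = wp.parse('warnings.json', geo=False)
print(gsr_df[['eventDate', 'month', 'country', 'model_short']].head())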