def main():
    '''
    Reads tweets from the input queue (or stdin), geo-annotates them, and
    publishes them to the output queue (or stdout).
    '''
    ap = args.get_parser()
    ap.add_argument('--cat', action="store_true",
                    help='Read input from standard in and write to standard out.')
    arg = ap.parse_args()
    logs.init(arg)
    geo_mena = GeoMena()
    geo_lac = Geo(geo_region=GEO_REGION.lac)
    try:
        if arg.cat:
            log.debug('Reading from stdin and writing to stdout.')
            ins = sys.stdin
            outs = sys.stdout
            for entry in ins:
                entry = entry.decode(encoding='utf-8')
                try:
                    tweet = json.loads(entry.strip())
                    geo_annotate(tweet, geo_mena, geo_lac)
                    if tweet is not None:
                        outs.write(json.dumps(tweet, ensure_ascii=False).encode("utf-8"))
                        outs.write('\n')
                        outs.flush()
                except Exception:
                    log.exception('Failed to process message "%s".', entry)
        else:
            queue.init(arg)
            with queue.open(arg.sub, 'r') as inq:
                with queue.open(arg.pub, 'w', capture=True) as outq:
                    for tweet in inq:
                        try:
                            content = geo_annotate(tweet, geo_mena, geo_lac)
                            if content is not None:
                                outq.write(content)
                        except KeyboardInterrupt:
                            log.info("Got SIGINT, exiting.")
                            break
                        except Exception:
                            log.exception('Failed to process message "%s".', tweet)
        return 0
    except Exception as e:
        log.exception("Unknown error in main function: %s", e)
        return 1
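
# geo_annotate() is used above but not defined in this snippet.  A minimal
# hypothetical sketch, assuming both geocoders expose the geo_normalize()
# call used by the other scripts here and that the result is attached under
# an assumed "embersGeoCode" key (both the API and the key are assumptions):
def geo_annotate(tweet, geo_mena, geo_lac):
    # try the MENA geocoder first, then fall back to the LAC one
    for coder in (geo_mena, geo_lac):
        try:
            city, country, state = coder.geo_normalize(tweet)[:3]
            if country:
                tweet['embersGeoCode'] = {'city': city, 'country': country,
                                          'state': state}
                break
        except Exception:
            continue
    return tweet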
def collectMentions(self):
    geo = Geo()
    tweetCount = 0
    tweetErrorCount = 0
    totalFiles = len(os.listdir(self.inputFolder))
    fileCount = 1
    for _file in sorted(os.listdir(self.inputFolder)):
        fileDate = datetime.strptime(_file[17:27], "%Y-%m-%d")
        if fileDate > self.toDate or fileDate < self.fromDate:
            continue
        log.debug("processing file %d/%d --> %s" % (fileCount, totalFiles, _file))
        fileCount += 1
        try:
            with open(self.inputFolder + "/" + _file, "r") as FILE:
                for line in FILE:
                    try:
                        jsonTweet = json.loads(line.strip())
                        geoList = geo.geo_normalize(jsonTweet)
                        city = geoList[0]
                        country = geoList[1]
                        state = geoList[2]
                        if ((self.city == '-' and self.state == '-' and
                             country and country.lower() == self.country) or
                            (country and country.lower() == self.country and
                             state and state.lower() == self.state) or
                            (country and country.lower() == self.country and
                             state and state.lower() == self.state and
                             city and city.lower() == self.city)):
                            tweetCount += 1
                            # use [5:25] if HH:MM:SS is needed
                            datestr = jsonTweet["interaction"]["created_at"][5:16]
                            klout, sentiment = getKloutSentiment(jsonTweet)
                            tweeterId = jsonTweet["interaction"]["author"]["id"]
                            candidatesFound = self.processTweet(jsonTweet)
                            self.updateScoreCard(candidatesFound, tweeterId,
                                                 klout, sentiment, datestr)
                    except Exception as f:
                        log.exception("error processing tweet %s", f)
                        tweetErrorCount += 1
        except Exception as e:
            log.exception("error processing file %s", e)
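
# getKloutSentiment() is called above but defined elsewhere in the repo.  A
# hypothetical sketch, assuming the DataSift "klout" and "salience"
# enrichments are present on the tweet (the field paths are an assumption,
# not taken from this source):
def getKloutSentiment(jsonTweet):
    klout = jsonTweet.get('klout', {}).get('score', 0)
    sentiment = jsonTweet.get('salience', {}).get('content', {}).get('sentiment', 0)
    return klout, sentiment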
def execute(arg):
    logs.init(arg)
    fromDate = datetime.strptime(arg.fromDate, "%d %b %Y")
    toDate = datetime.strptime(arg.toDate, "%d %b %Y")
    tweetFolder = arg.tweetFolder
    country = arg.country

    hashTagCounts = {}
    uids = {}

    # load twitter handles from the seed file
    with open(arg.seedFile, 'r') as _file:
        for line in _file:
            handle, candidate = line.strip().split(',')
            if candidate not in uids:
                uids[candidate] = []
                hashTagCounts[candidate] = {}
            uids[candidate].append(handle.lower())

    # for geolocation
    geo = Geo()

    for _file in sorted(os.listdir(tweetFolder)):
        fileDate = datetime.strptime(_file[17:27], '%Y-%m-%d')
        if fromDate <= fileDate < toDate:
            log.info("processing file %s" % _file)
            try:
                with open(tweetFolder + "/" + _file, "r") as FILE:
                    for line in FILE:
                        try:
                            jsonTweet = json.loads(line.strip())
                            dateStr = jsonTweet['interaction']['created_at'][5:16]
                            tweetDate = datetime.strptime(dateStr, '%d %b %Y')
                            geoList = geo.geo_normalize(jsonTweet)
                            city, ctry, state = geoList[:3]
                            if ctry and ctry.lower() == country and fromDate <= tweetDate <= toDate:
                                userId, realName = None, None
                                if 'twitter' in jsonTweet:
                                    if 'user' in jsonTweet['twitter']:
                                        if 'screen_name' in jsonTweet['twitter']['user']:
                                            userId = jsonTweet['twitter']['user']['screen_name'].lower()
                                        if 'name' in jsonTweet['twitter']['user']:
                                            realName = jsonTweet['twitter']['user']['name'].lower()
                                if userId is None and realName is None:
                                    continue
                                log.debug('userId or realName is not None')
                                candidate = getCandidate(userId, realName, uids)
                                if candidate is not None:
                                    log.debug('found candidate --> ' + candidate)
                                    # pre-process the tweet text
                                    text = jsonTweet["interaction"]["content"]
                                    text = re.sub(URL_REGEX, ' ', text)  # remove urls
                                    text = re.sub('[^A-Za-z_@#0-9]', ' ', normalize_str(text, lower=True))  # keep only alphanumerics and twitter tags
                                    text = re.sub(' +', ' ', text)  # collapse multiple spaces

                                    hashTags = extract_hash_tags(text)
                                    hashTags = [h for h in hashTags if len(h) > 3]
                                    for hashTag in hashTags:
                                        if hashTag.startswith('#'):
                                            hashTag = hashTag[1:]
                                        if hashTag in hashTagCounts[candidate]:
                                            hashTagCounts[candidate][hashTag] += 1
                                        else:
                                            hashTagCounts[candidate][hashTag] = 1
                        except Exception as e:
                            log.exception('error processing tweet %s' % e)
            except Exception as f:
                log.exception('error processing file %s' % f)
        else:
            log.debug('skipping file %s' % _file)
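
# extract_hash_tags() is shared by several of these scripts but not defined
# here.  A minimal regex-based sketch (an assumption, not necessarily the
# repo's implementation); the callers above strip any leading '#' themselves:
import re

def extract_hash_tags(text):
    return re.findall(r'#\w+', text)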
__version__ = "0.0.1" import json import gzip from geoutils.dbManager import ESWrapper from geocode_twitter import TweetGeocoder from embers.geocode import Geo, decode from embers.geocode_mena import GeoMena as MENAGEO DB = ESWrapper('geonames', 'places') GEO = TweetGeocoder(DB) ptrue, pfalse = 0, 0 error = open("error_colombia.txt", "w") mGeo = MENAGEO() eGeo = Geo() eGeo = mGeo def embersgeo(doc): msg = json.loads(doc) try: lt, ln, places, texts, enr = eGeo._normalize_payload(msg) true_geo = eGeo._geo_normalize(lt, ln, None, {}, None, eGeo.priority_policy) true_geo = { "city": decode(true_geo[0]), "country": decode(true_geo[1]), "admin1": decode(true_geo[2]) }
def preProcess(tweetFolder, outputFolder, keywordList, fromDate, toDate, country):
    log.info("inside preProcess")
    log.debug("fromDate --> " + fromDate.strftime("%d %b %Y"))
    log.debug("toDate --> " + toDate.strftime("%d %b %Y"))

    tweets = {}

    # output files
    tweetedFile = open(outputFolder + '/tweeted.csv', 'w')
    mentionFile = open(outputFolder + '/mentioned.csv', 'w')
    # retweetFile = open(outputFolder + '/retweet.csv', 'w')
    wordsFile = open(outputFolder + '/containsWord.csv', 'w')
    sentimentFile = open(outputFolder + '/sentiment.csv', 'w')
    tweetsFile = open(outputFolder + '/tweets.json', 'w')

    # build stop word list
    # englishStopWords = [normalize_str(w).lower() for w in stopwords.words('english')]
    # spanishStopWords = [normalize_str(w).lower() for w in stopwords.words('spanish')]
    # stopWordList = []
    # stopWordList.extend(englishStopWords)
    # stopWordList.extend(spanishStopWords)

    log.info("# of keywords: " + str(len(keywordList)))
    log.info("tracking --> " + str(keywordList))

    # build a regular expression matching any keyword
    keywordRegex = re.compile(r'\b%s\b' % r'\b|\b'.join(keywordList),
                              flags=re.IGNORECASE)

    # for geocoding tweets
    geo = Geo()
    tweetCount, tweetErrorCount = 0, 0

    for _file in sorted(os.listdir(tweetFolder)):
        fileDate = datetime.strptime(_file[17:27], '%Y-%m-%d')
        if fromDate <= fileDate < toDate:
            log.info("processing file %s" % _file)
            try:
                with open(tweetFolder + "/" + _file, "r") as FILE:
                    for line in FILE:
                        try:
                            jsonTweet = json.loads(line.strip())
                            dateStr = jsonTweet['interaction']['created_at'][5:16]
                            tweetDate = datetime.strptime(dateStr, '%d %b %Y')
                            geoList = geo.geo_normalize(jsonTweet)
                            city, ctry, state = geoList[:3]
                            if ctry and ctry.lower() == country and fromDate <= tweetDate <= toDate:
                                # pre-process the tweet text
                                text = jsonTweet["interaction"]["content"]
                                text = re.sub(URL_REGEX, ' ', text)  # remove urls
                                text = re.sub('[^A-Za-z_@#0-9]', ' ', normalize_str(text, lower=True))  # keep only alphanumerics and twitter tags
                                text = re.sub(' +', ' ', text)  # collapse multiple spaces

                                keywordsPresent = re.findall(keywordRegex, text)
                                keywordsPresent = list(set(keywordsPresent))
                                if len(keywordsPresent) > 0:
                                    tweetId = jsonTweet["twitter"]["id"]
                                    tweeterId = str(jsonTweet["interaction"]["author"]["id"])
                                    mentions = getInteractions(jsonTweet)
                                    sentiment = getSentiment(jsonTweet)

                                    hashTags = extract_hash_tags(text)
                                    hashTags = [h for h in hashTags if len(h) > 3]
                                    # hashTags.extend(keywordsPresent)
                                    if len(hashTags) == 0:
                                        continue
                                    hashTags = list(set(hashTags).union(set(keywordsPresent)))

                                    tweetedFile.write(tweeterId + ',' + tweetId + '\n')
                                    sentimentFile.write(tweetId + ',' + str(sentiment) + '\n')
                                    for userId in mentions:
                                        mentionFile.write(tweetId + ',' + userId + '\n')
                                    # for userId in retweets:
                                    #     retweetFile.write(tweetId + ',' + userId + '\n')
                                    for hashTag in hashTags:
                                        if hashTag.startswith('#'):
                                            hashTag = hashTag[1:]
                                        wordsFile.write(tweetId + ',' + hashTag + '\n')

                                    # track the tweets for later checks
                                    if tweeterId not in tweets:
                                        tweets[tweeterId] = {}
                                    tweets[tweeterId][tweetId] = jsonTweet["interaction"]["content"]

                                    tweetCount += 1
                        except Exception as f:
                            log.debug("error processing tweet %s", f)
                            tweetErrorCount += 1
            except Exception as e:
                log.exception("error processing file %s", e)
        else:
            log.debug("skipping file %s" % _file)
def trackTweets(tweetFolder, vocab, fromDate, toDate, country, threshold):
    counts = {}
    regex = {}
    totalWords = 0

    # build a regex for each group from its top-weighted words
    for group in vocab:
        counts[group] = {}
        sorted_tuples = sorted(vocab[group].iteritems(),
                               key=operator.itemgetter(1), reverse=True)
        words = []
        if len(sorted_tuples) <= 20:
            cutoff = len(sorted_tuples)
        else:
            # keep the top `threshold` percent of words; a local cutoff so the
            # percentage is not clobbered between groups
            cutoff = int(len(sorted_tuples) * threshold // 100)
        for (word, weight) in sorted_tuples[:cutoff]:
            words.append(word)
            totalWords += 1
        regex[group] = re.compile(r'\b%s\b' % r'\b|\b'.join(words),
                                  flags=re.IGNORECASE)
    log.info("tracking a total of %d words" % totalWords)

    # for geocoding tweets
    geo = Geo()
    tweetCount, tweetErrorCount = 0, 0

    for _file in sorted(os.listdir(tweetFolder)):
        fileDate = datetime.strptime(_file[17:27], "%Y-%m-%d")
        if fromDate <= fileDate < toDate:
            log.info("processing file %s" % _file)
            try:
                with open(tweetFolder + "/" + _file, "r") as FILE:
                    for line in FILE:
                        try:
                            jsonTweet = json.loads(line.strip())
                            dateStr = jsonTweet["interaction"]["created_at"][5:16]
                            tweetDate = datetime.strptime(dateStr, "%d %b %Y")
                            geoList = geo.geo_normalize(jsonTweet)
                            city, ctry, state = geoList[:3]
                            if ctry and ctry.lower() == country and fromDate <= tweetDate <= toDate:
                                # pre-process the tweet text
                                text = jsonTweet["interaction"]["content"]
                                text = re.sub(URL_REGEX, " ", text)  # remove urls
                                text = re.sub("[^A-Za-z_@#0-9]", " ", normalize_str(text, lower=True))  # keep only alphanumerics and twitter tags
                                text = re.sub(" +", " ", text)  # collapse multiple spaces

                                for group in regex:
                                    keywordsPresent = re.findall(regex[group], text)
                                    if len(keywordsPresent) > 0:
                                        keywordsPresent = list(set(keywordsPresent))
                                        hashTags = extract_hash_tags(text)
                                        hashTags = [h for h in hashTags if len(h) > 3]
                                        hashTags.extend(keywordsPresent)
                                        for hashTag in hashTags:
                                            if hashTag.startswith("#"):
                                                hashTag = hashTag[1:]
                                            if hashTag in counts[group]:
                                                counts[group][hashTag] += 1
                                            else:
                                                counts[group][hashTag] = 1
                                        tweetCount += 1
                        except Exception as f:
                            log.debug("error processing tweet %s", f)
                            tweetErrorCount += 1
            except Exception as e:
                log.exception("error processing file %s", e)
        else:
            log.debug("skipping file %s" % _file)
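
# A small worked example of the cutoff logic above: with 200 scored words and
# threshold=10, the top int(200 * 10 // 100) == 20 words per group are kept.
# The toy vocab below has fewer than 20 entries, so all of them are compiled
# into the group's regex:
import re

vocab = {'protest': {'marcha': 5.0, 'huelga': 3.0, 'paro': 2.0}}
words = [w for w, _ in sorted(vocab['protest'].items(),
                              key=lambda kv: kv[1], reverse=True)]
pattern = re.compile(r'\b%s\b' % r'\b|\b'.join(words), flags=re.IGNORECASE)
print(pattern.findall('gran Marcha y paro nacional'))  # ['Marcha', 'paro']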
def preProcess(tweetFolder, outputFolder, keywordList, fromDate, toDate,
               country, filesProcessed):
    log.info("inside preProcess")
    log.debug("fromDate --> " + fromDate.strftime("%d %b %Y"))
    log.debug("toDate --> " + toDate.strftime("%d %b %Y"))

    tweetCount, tweetErrorCount = 0, 0
    tweets = {}

    # output files
    tweetedFile = open(outputFolder + '/tweeted.csv', 'w')
    mentionFile = open(outputFolder + '/mentioned.csv', 'w')
    retweetFile = open(outputFolder + '/retweet.csv', 'w')
    wordsFile = open(outputFolder + '/containsWord.csv', 'w')
    sentimentFile = open(outputFolder + '/sentiment.csv', 'w')
    tweetsFile = open(outputFolder + '/tweets.json', 'w')

    # build stop word list
    englishStopWords = [normalize_str(w).lower() for w in stopwords.words('english')]
    spanishStopWords = [normalize_str(w).lower() for w in stopwords.words('spanish')]
    stopWordList = []
    stopWordList.extend(englishStopWords)
    stopWordList.extend(spanishStopWords)

    log.info("# of keywords: " + str(len(keywordList)))
    log.info("tracking --> " + str(keywordList))

    # build a regular expression matching any keyword
    keywordRegex = re.compile(r'\b%s\b' % r'\b|\b'.join(keywordList),
                              flags=re.IGNORECASE)

    # for geocoding tweets
    geo = Geo()

    log.info("filesProcessed --> " + str(filesProcessed))
    for _file in sorted(os.listdir(tweetFolder)):
        fileDate = datetime.strptime(_file[17:27], '%Y-%m-%d')
        if _file not in filesProcessed and fromDate <= fileDate < toDate:
            log.info("processing file %s" % _file)
            try:
                with open(tweetFolder + "/" + _file, "r") as FILE:
                    tweetCount, tweetErrorCount = 0, 0
                    for line in FILE:
                        try:
                            jsonTweet = json.loads(line.strip())
                            dateStr = jsonTweet['interaction']['created_at'][5:16]
                            tweetDate = datetime.strptime(dateStr, '%d %b %Y')
                            sentiment = getSentiment(jsonTweet)
                            if sentiment == 0:
                                continue
                            geoList = geo.geo_normalize(jsonTweet)
                            ctry, a1, a2, a3 = geoList[1:5]
                            if ctry and ctry.lower() == country and fromDate <= tweetDate <= toDate:
                                text = jsonTweet["interaction"]["content"]
                                text = re.sub(URL_REGEX, ' ', text)  # remove urls
                                text = re.sub('[^A-Za-z_@#0-9]', ' ', normalize_str(text, lower=True))  # keep only alphanumerics and twitter tags
                                text = re.sub(' +', ' ', text)  # collapse multiple spaces

                                keywordsPresent = keywordRegex.search(text)
                                if keywordsPresent is not None:
                                    words = text.split(" ")
                                    words = [w for w in words
                                             if len(w) > 2 and w not in stopWordList]
                                    words2 = []
                                    for word in words:
                                        # drop words containing digits or '@'
                                        # unless they are tracked keywords
                                        for w in word:
                                            if (word not in keywordList) and (w.isdigit() or w == '@'):
                                                break
                                        else:
                                            if word[0] == '#':
                                                word = word[1:]
                                            words2.append(word)

                                    tweetId = jsonTweet["twitter"]["id"]
                                    tweeterId = str(jsonTweet["interaction"]["author"]["id"])
                                    mentions, retweets = getInteractions(jsonTweet)

                                    tweetedFile.write(tweeterId + ',' + tweetId + '\n')
                                    sentimentFile.write(tweetId + ',' + str(sentiment) + '\n')
                                    for userId in mentions:
                                        mentionFile.write(tweetId + ',' + userId + '\n')
                                    for userId in retweets:
                                        retweetFile.write(tweetId + ',' + userId + '\n')
                                    for word in words2:
                                        wordsFile.write(tweetId + ',' + word + '\n')

                                    # track the tweets for later checks
                                    if tweeterId not in tweets:
                                        tweets[tweeterId] = {}
                                    tweets[tweeterId][tweetId] = jsonTweet["interaction"]["content"]

                                    tweetCount += 1
                        except Exception as f:
                            log.exception("error processing tweet %s", f)
                            tweetErrorCount += 1
            except Exception as e:
                log.exception("error processing file %s", e)
            log.info("tweets used: %s" % str(tweetCount))
            log.debug("tweetErrorCount: %s" % str(tweetErrorCount))
            filesProcessed.append(_file)
            # process one new file per invocation
            break
        else:
            log.debug("skipping file %s" % _file)
sys.path.insert(1, path)

from embers.geocode import Geo
from etool import args, logs
from datetime import datetime

log = logs.getLogger(__processor__)

if __name__ == "__main__":
    ap = args.get_parser()
    ap.add_argument('-t', '--tweetFolder', type=str,
                    help='inputFolder pointing to PSLs output',
                    default='/hdd/tweets/2012/oct')
    ap.add_argument('-c', '--country', type=str)
    ap.add_argument('-m', '--month', type=str)
    arg = ap.parse_args()
    logs.init(arg)

    geo = Geo()
    tweetCount = 0
    date = datetime.strptime(arg.month, "%b %Y")

    for _file in os.listdir(arg.tweetFolder):
        try:
            with open(arg.tweetFolder + "/" + _file, "r") as FILE:
                for line in FILE:
                    try:
                        jsonTweet = json.loads(line.strip())
                        dateStr = jsonTweet['interaction']['created_at'][5:16]
                        tweetDate = datetime.strptime(dateStr, '%d %b %Y')
                        geoList = geo.geo_normalize(jsonTweet)
                        city, ctry, state = geoList[:3]
                        if (ctry and ctry.lower() == arg.country.lower() and
                                date.month == tweetDate.month and
                                date.year == tweetDate.year):
                            tweetCount += 1
                    except Exception as f:
                        # truncated in the source; assumed handler, following
                        # the pattern of the sibling scripts
                        log.exception("error processing tweet %s", f)
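
# Every script above parses the capture date out of a fixed offset in the
# input filename via _file[17:27].  A hypothetical example (the real naming
# scheme is an assumption; only the [17:27] slice comes from the source):
from datetime import datetime

fname = 'tweets-colombia--2012-10-05.json'
print(fname[17:27])                                 # '2012-10-05'
print(datetime.strptime(fname[17:27], '%Y-%m-%d'))  # 2012-10-05 00:00:00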
class WarningParser(object):
    def __init__(self, transforms=None, createCols=None, complex_fns={}):
        self.et_cls = ("011", "012", "013", "014", "015", "016")
        self.pop_cls = (u'Business', u'Media', u'Medical', u'Legal',
                        u'General Population', u'Refugees/Displaced',
                        u'Ethnic', u'Agricultural', u'Labor', u'Religious',
                        u'Education')
        self.viol_cls = ("1", "2")
        if transforms:
            self.transforms = transforms
        else:
            self.transforms = {'eventDate': lambda x: parse(x[:10]),
                               'date': lambda x: parse(x[:10])}
        if createCols:
            self.createCols = createCols
        else:
            self.createCols = {'month': {'transformCol': 'eventDate',
                                         'transformFn': lambda x: x[:7]},
                               'country': {'transformCol': 'location',
                                           'transformFn': lambda x: x[0]},
                               'model_short': {'transformCol': 'model',
                                               'transformFn': self.get_model},
                               'eventcode': {'transformCol': 'eventType',
                                             'transformFn': lambda x: x[:3]}}
        if complex_fns:
            self.complex_fns = complex_fns
        else:
            self.complex_fns = {"reportingDelay":
                                lambda x: (x['date'] - x['eventDate']) / np.timedelta64(1, 'D')}
        self.seen = shelve.open('persistent_shelve.db')
        self.embersgeo = None

    def parse(self, gsrObj, geo=False):
        if geo:
            self.embersgeo = Geo()
        if isinstance(gsrObj, file):
            gsr = [self._formatcheck(json.loads(l), geo) for l in gsrObj if l.strip()]
        elif isinstance(gsrObj, basestring):
            with open(gsrObj) as gfile:
                gsr = [self._formatcheck(json.loads(l), geo) for l in gfile if l.strip()]
        elif isinstance(gsrObj, list):
            gsr = [self._formatcheck(j, geo) for j in gsrObj]
        else:
            raise NotImplementedError
        gsr_df = self._dfmap(gsr)
        return gsr_df

    def _dfmap(self, gsr):
        return DataFrame_mod(gsr).multiapply(applyfns=self.transforms,
                                             newCols=self.createCols,
                                             complexfns=self.complex_fns)

    def _formatcheck(self, j, geo=False):
        if "classification" not in j:
            try:
                if len(j["eventType"]) < 4:
                    j["eventType"] += "1"
                j["classification"] = {"eventType": {k: 0.0 for k in self.et_cls},
                                       "population": {k: 0.0 for k in self.pop_cls},
                                       "violence": {k: 0.0 for k in self.viol_cls}}
                j["classification"]["eventType"][j["eventType"][:3]] = 1.0
                j["classification"]["violence"][j["eventType"][3]] = 1.0
                j["classification"]["population"][j["population"]] = 1.0
            except Exception:
                pass
        if isinstance(j["classification"], basestring):
            j["classification"] = json.loads(j["classification"])
        if isinstance(j["classification"], list):
            j["classification"] = j["classification"][0]
        if "matched_gsr" in j and isinstance(j["matched_gsr"], basestring):
            j["matched_gsr"] = json.loads(j["matched_gsr"])
        if "match_score" in j and isinstance(j["match_score"], basestring):
            j["match_score"] = json.loads(j["match_score"])
        # all empty location fields have to be expressed using '-' instead of ''
        if 'location' in j:
            j['location'] = ['-' if l.strip() == '' else l for l in j['location']]
        if geo:
            j['locInfo'] = self.get_locInfo(j['location'])
            j['coordinates'] = ([j['locInfo']['latitude'], j['locInfo']['longitude']]
                                if j['locInfo'] else [None, None])
        return j

    def is_citylevel(self, loc):
        """Determine if city-level info is present in the location."""
        return loc[2] != "" and loc[2] != "-"

    def get_locInfo(self, loctuple, canonicalLT=None):
        lstr = encode(','.join(loctuple))
        if lstr in self.seen:
            return self.seen[lstr]
        loc_headers = ("city", "country", "state", "admin2", "admin3",
                       "population_size", "latitude", "longitude", "id", "freq")
        if not self.is_citylevel(loctuple):
            return None
        aliasLT = None
        if not canonicalLT:
            co, st, ci = reverse_osicorrection(loctuple)
            # if not self.is_citylevel(loctuple):
            #     canonicalLT, aliasLT = self.embersgeo.best_guess(ci, co, st)
            canonicalLT, aliasLT = self.embersgeo.best_guess_city(ci, co, st)
        if canonicalLT:
            try:
                canonicalLT[0] = [decode(l) for l in canonicalLT[0]]
            except Exception:
                pass
            ldict = dict(zip(loc_headers, canonicalLT[0]))
            self.seen[lstr] = ldict
            return ldict
        if aliasLT:
            try:
                aliasLT[0] = [decode(l) for l in aliasLT[0]]
            except Exception:
                pass
            ldict = dict(zip(loc_headers, aliasLT[0]))
            self.seen[lstr] = ldict
            return ldict
        return None

    def get_model(self, name):
        """Get the model's short name."""
        nl = name.lower()
        if "dynamic query" in nl:
            return "dqe"
        if "lasso" in nl:
            return "lasso"
        if "baserate" in nl:
            return "br"
        if "planned" in nl:
            return "pp"
        if "mle" in nl:
            return "mle"
        if "civil unrest fast-spatial-scan" in nl:
            return "ss"
        if "locrewriter" in nl:
            return "lw"
        return nl
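
# A minimal usage sketch for WarningParser (the input file name is
# hypothetical): parse a line-delimited warnings/GSR JSON file into a
# DataFrame without geocoding, then inspect the derived columns.
wp = WarningParser()
gsr_df = wp.parse('warnings.json', geo=False)
print(gsr_df[['eventDate', 'month', 'country', 'model_short']].head())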