def analyze_recent(tweet_file_path, analyze_points, models, es_url=None): if es_url == None: es = elasticsearch.Elasticsearch() else: es = elasticsearch.Elasticsearch([es_url]) files = sorted(os.listdir(tweet_file_path), key=lambda x: os.stat(os.path.join(tweet_file_path, x)).st_mtime) for file in files: if file in ["analyzed", "live_stream"]: continue print "analyzing file:", file d0 = open(tweet_file_path + "/" + file) for line in d0: sr = ScoreRecord(line) sr.write_to_es("jag_geqestream_documents", "post", es) os.rename(tweet_file_path+"/"+file, tweet_file_path+"/analyzed/"+file) query = {"filter":{"bool":{"must":[{"range":{"post_date":{"gte":"now-1h"}}}]}}} n_hits = es.search(index="jag_geqestream_documents", doc_type="post", body=query, search_type="count")['hits']['total'] scanResp = es.search(index="jag_geqestream_documents", doc_type="post", body=query, search_type="scan", scroll="10m") scrollId = scanResp['_scroll_id'] response = es.scroll(scroll_id=scrollId, scroll= "10m") bins = {} print "\tAnalyzing", n_hits, "hits" while n_hits>0: n_hits = n_hits - len(response["hits"]["hits"]) for hit in response["hits"]["hits"]: sr = ScoreRecord(hit, 1) k = rec_to_key(sr) if k in bins.keys(): bins[k].add_record(sr) else: bins[k] = ScoreBin(sr) if n_hits > 0: response= es.scroll(scroll_id=scrollId, scroll= "10m") full_bins = filter(lambda x: x.bin_size()>5, bins.values()) print "\tScoring", len(full_bins), "bins" for fb in full_bins: for k, v in models.iteritems(): fb.apply_model(k, v) if len(fb.model_scores.keys()) > 0: write_rec = False for score in fb.model_scores.values(): if score > 0.5: write_rec = True if write_rec == True: fb.save_score(es, "jag_geqestream_points", "post")
def analyze_recent(tweet_file_path, analyze_points, models, es_url=None): if es_url == None: es = elasticsearch.Elasticsearch() else: es = elasticsearch.Elasticsearch([es_url]) files = sorted( os.listdir(tweet_file_path), key=lambda x: os.stat(os.path.join(tweet_file_path, x)).st_mtime) for file in files: if file in ["analyzed", "live_stream"]: continue print "analyzing file:", file d0 = open(tweet_file_path + "/" + file) for line in d0: sr = ScoreRecord(line) sr.write_to_es("jag_geqestream_documents", "post", es) os.rename(tweet_file_path + "/" + file, tweet_file_path + "/analyzed/" + file) query = { "filter": { "bool": { "must": [{ "range": { "post_date": { "gte": "now-1h" } } }] } } } n_hits = es.search(index="jag_geqestream_documents", doc_type="post", body=query, search_type="count")['hits']['total'] scanResp = es.search(index="jag_geqestream_documents", doc_type="post", body=query, search_type="scan", scroll="10m") scrollId = scanResp['_scroll_id'] response = es.scroll(scroll_id=scrollId, scroll="10m") bins = {} print "\tAnalyzing", n_hits, "hits" while n_hits > 0: n_hits = n_hits - len(response["hits"]["hits"]) for hit in response["hits"]["hits"]: sr = ScoreRecord(hit, 1) k = rec_to_key(sr) if k in bins.keys(): bins[k].add_record(sr) else: bins[k] = ScoreBin(sr) if n_hits > 0: response = es.scroll(scroll_id=scrollId, scroll="10m") full_bins = filter(lambda x: x.bin_size() > 5, bins.values()) print "\tScoring", len(full_bins), "bins" for fb in full_bins: for k, v in models.iteritems(): fb.apply_model(k, v) if len(fb.model_scores.keys()) > 0: write_rec = False for score in fb.model_scores.values(): if score > 0.5: write_rec = True if write_rec == True: fb.save_score(es, "jag_geqestream_points", "post")
def analyze_recent(tweet_file_path, es_url=None, tag_blacklist=set()): es = None if es_url == None: es = elasticsearch.Elasticsearch() else: es = elasticsearch.Elasticsearch([es_url]) files = sorted(os.listdir(tweet_file_path), key=lambda x: os.stat(os.path.join(tweet_file_path, x)).st_mtime) new_records = {} for file in files: if file in ["analyzed", "live_stream"]: continue print "analyzing file:", file d0 = open(tweet_file_path + "/" + file) for line in d0: sr = ScoreRecord(line) for tag in sr.tags: if tag in new_records.keys(): new_records[tag].append(sr) else: new_records[tag] = [sr] os.rename(tweet_file_path+"/"+file, tweet_file_path+"/analyzed/"+file) for tag, lst_rec in new_records.iteritems(): if tag in tag_blacklist: continue print "Getting data for tag: ", tag, count = es.count(index="jag_hc2_documents", doc_type="post", q="tags:"+tag)["count"] #test if there is enough entries for clustering n_entries = count+len(lst_rec) print " -> ", n_entries, "entries total" if n_entries<5: print "--> only", n_entries, "entries (insufficient for clustering)" for sr in lst_rec: sr.write_to_es("jag_hc2_documents","post",es) continue #associate querries with the existing hashtag list first = True tag_bin = None for entry in lst_rec: if first: first = False tag_bin = ScoreBin(record=entry, hashtag=tag) else: tag_bin.add_record(entry) if count > 0: #query ES to get previous entries with the same tags from the last 4 hours print "Query ES for tag" now = datetime.datetime.now() timewindow = now - datetime.timedelta(hours=8) res = es.search(\ index="jag_hc2_documents", \ doc_type="post", \ body={ "query": { "match":{ "tags": tag } }, "filter": { "bool":{ "must" :[ { "range": { "post_date":{ "gte" : datetime_to_es_format(timewindow), "lte" : datetime_to_es_format(now) } } } ] } } }\ ) for hit in res["hits"]["hits"]: sr = ScoreRecord(hit, data_type=1) tag_bin.add_record(sr) #perform clustering on larger list tag_bin.cluster_and_write_to_es(0.001, 10, 5, es, "jag_hc2_documents", "post", "jag_hc2_clusters", "post")
import sys