Example #1
0
def analyze_recent(tweet_file_path, analyze_points, models, es_url=None):
    if es_url == None:
        es = elasticsearch.Elasticsearch()
    else:
        es = elasticsearch.Elasticsearch([es_url])

    files = sorted(
        os.listdir(tweet_file_path),
        key=lambda x: os.stat(os.path.join(tweet_file_path, x)).st_mtime)
    for file in files:
        if file in ["analyzed", "live_stream"]:
            continue
        print "analyzing file:", file
        d0 = open(tweet_file_path + "/" + file)
        for line in d0:
            sr = ScoreRecord(line)
            sr.write_to_es("jag_geqestream_documents", "post", es)
        os.rename(tweet_file_path + "/" + file,
                  tweet_file_path + "/analyzed/" + file)

    query = {
        "filter": {
            "bool": {
                "must": [{
                    "range": {
                        "post_date": {
                            "gte": "now-1h"
                        }
                    }
                }]
            }
        }
    }
    n_hits = es.search(index="jag_geqestream_documents",
                       doc_type="post",
                       body=query,
                       search_type="count")['hits']['total']
    scanResp = es.search(index="jag_geqestream_documents",
                         doc_type="post",
                         body=query,
                         search_type="scan",
                         scroll="10m")
    scrollId = scanResp['_scroll_id']
    response = es.scroll(scroll_id=scrollId, scroll="10m")
    bins = {}
    print "\tAnalyzing", n_hits, "hits"
    while n_hits > 0:
        n_hits = n_hits - len(response["hits"]["hits"])
        for hit in response["hits"]["hits"]:
            sr = ScoreRecord(hit, 1)
            k = rec_to_key(sr)
            if k in bins.keys():
                bins[k].add_record(sr)
            else:
                bins[k] = ScoreBin(sr)
        if n_hits > 0:
            response = es.scroll(scroll_id=scrollId, scroll="10m")

    full_bins = filter(lambda x: x.bin_size() > 5, bins.values())
    print "\tScoring", len(full_bins), "bins"
    for fb in full_bins:
        for k, v in models.iteritems():
            fb.apply_model(k, v)
        if len(fb.model_scores.keys()) > 0:
            write_rec = False
            for score in fb.model_scores.values():
                if score > 0.5:
                    write_rec = True
            if write_rec == True:
                fb.save_score(es, "jag_geqestream_points", "post")
Example #2
0
import sys