Example #1
import os

import elasticsearch


def analyze_recent(tweet_file_path, analyze_points, models, es_url=None):
    if es_url is None:
        es = elasticsearch.Elasticsearch()
    else:
        es = elasticsearch.Elasticsearch([es_url])

    # Index new tweet files (oldest first), then move each one aside.
    files = sorted(os.listdir(tweet_file_path),
                   key=lambda x: os.stat(os.path.join(tweet_file_path, x)).st_mtime)
    for file in files:
        if file in ["analyzed", "live_stream"]:
            continue
        print "analyzing file:", file
        with open(os.path.join(tweet_file_path, file)) as d0:
            for line in d0:
                sr = ScoreRecord(line)
                sr.write_to_es("jag_geqestream_documents", "post", es)
        os.rename(os.path.join(tweet_file_path, file),
                  os.path.join(tweet_file_path, "analyzed", file))

    # Scan/scroll over everything posted in the last hour, binning
    # records by key.
    query = {"filter": {"bool": {"must": [
        {"range": {"post_date": {"gte": "now-1h"}}}]}}}
    n_hits = es.search(index="jag_geqestream_documents", doc_type="post",
                       body=query, search_type="count")['hits']['total']
    scanResp = es.search(index="jag_geqestream_documents", doc_type="post",
                         body=query, search_type="scan", scroll="10m")
    scrollId = scanResp['_scroll_id']
    response = es.scroll(scroll_id=scrollId, scroll="10m")
    bins = {}
    print "\tAnalyzing", n_hits, "hits"
    while n_hits > 0:
        # An empty page means the scroll is exhausted; stop rather than
        # loop forever if the count and the scan disagree.
        if not response["hits"]["hits"]:
            break
        n_hits -= len(response["hits"]["hits"])
        for hit in response["hits"]["hits"]:
            sr = ScoreRecord(hit, 1)
            k = rec_to_key(sr)
            if k in bins:
                bins[k].add_record(sr)
            else:
                bins[k] = ScoreBin(sr)
        if n_hits > 0:
            response = es.scroll(scroll_id=scrollId, scroll="10m")

    # Score bins holding more than five records; persist any bin that at
    # least one model scores above 0.5.
    full_bins = filter(lambda x: x.bin_size() > 5, bins.values())
    print "\tScoring", len(full_bins), "bins"
    for fb in full_bins:
        for k, v in models.iteritems():
            fb.apply_model(k, v)
        if any(score > 0.5 for score in fb.model_scores.values()):
            fb.save_score(es, "jag_geqestream_points", "post")
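
The count-driven scroll loop above is fragile: without the guard on an empty page, a mismatch between the count and the scan would spin forever. elasticsearch-py ships a helpers.scan generator that drives the scroll itself; below is a minimal sketch of the same binning pass, reusing ScoreRecord, rec_to_key, and ScoreBin from the example, and assuming a client version old enough to still accept doc_type.

from elasticsearch import Elasticsearch, helpers

es = Elasticsearch()
query = {"filter": {"bool": {"must": [
    {"range": {"post_date": {"gte": "now-1h"}}}]}}}

bins = {}
# helpers.scan hides the scan/scroll bookkeeping and yields one hit at
# a time until the scroll is exhausted.
for hit in helpers.scan(es, query=query, scroll="10m",
                        index="jag_geqestream_documents", doc_type="post"):
    sr = ScoreRecord(hit, 1)
    k = rec_to_key(sr)
    if k in bins:
        bins[k].add_record(sr)
    else:
        bins[k] = ScoreBin(sr)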
Example #2
import datetime
import os

import elasticsearch


def analyze_recent(tweet_file_path, es_url=None, tag_blacklist=set()):
    if es_url is None:
        es = elasticsearch.Elasticsearch()
    else:
        es = elasticsearch.Elasticsearch([es_url])

    # Read new tweet files (oldest first), group records by hashtag,
    # then move each file into the analyzed/ directory.
    files = sorted(os.listdir(tweet_file_path),
                   key=lambda x: os.stat(os.path.join(tweet_file_path, x)).st_mtime)
    new_records = {}
    for file in files:
        if file in ["analyzed", "live_stream"]:
            continue
        print "analyzing file:", file
        with open(os.path.join(tweet_file_path, file)) as d0:
            for line in d0:
                sr = ScoreRecord(line)
                for tag in sr.tags:
                    if tag in new_records:
                        new_records[tag].append(sr)
                    else:
                        new_records[tag] = [sr]
        os.rename(os.path.join(tweet_file_path, file),
                  os.path.join(tweet_file_path, "analyzed", file))

    for tag, lst_rec in new_records.iteritems():
        if tag in tag_blacklist:
            continue
        print "Getting data for tag: ", tag,
        count = es.count(index="jag_hc2_documents", doc_type="post",
                         q="tags:" + tag)["count"]
        # Test whether there are enough entries for clustering.
        n_entries = count + len(lst_rec)
        print " -> ", n_entries, "entries total"
        if n_entries < 5:
            print "--> only", n_entries, "entries (insufficient for clustering)"
            for sr in lst_rec:
                sr.write_to_es("jag_hc2_documents", "post", es)
            continue

        # Collect the new records for this hashtag into a single bin.
        tag_bin = ScoreBin(record=lst_rec[0], hashtag=tag)
        for entry in lst_rec[1:]:
            tag_bin.add_record(entry)
        if count > 0:
            # Query ES for previous entries with the same tag from the
            # last 8 hours.
            print "Query ES for tag"
            now = datetime.datetime.now()
            timewindow = now - datetime.timedelta(hours=8)
            res = es.search(
                index="jag_hc2_documents",
                doc_type="post",
                body={
                    "query": {
                        "match": {"tags": tag}
                    },
                    "filter": {
                        "bool": {
                            "must": [{
                                "range": {
                                    "post_date": {
                                        "gte": datetime_to_es_format(timewindow),
                                        "lte": datetime_to_es_format(now)
                                    }
                                }
                            }]
                        }
                    }
                })
            for hit in res["hits"]["hits"]:
                sr = ScoreRecord(hit, data_type=1)
                tag_bin.add_record(sr)

        # Perform clustering on the combined list and write results out.
        tag_bin.cluster_and_write_to_es(0.001, 10, 5, es,
                                        "jag_hc2_documents", "post",
                                        "jag_hc2_clusters", "post")
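
A hypothetical call site for this variant; the directory, URL, and blacklist contents are made up for illustration:

# Hypothetical invocation; path, URL, and tags are illustrative only.
analyze_recent("/data/tweets",
               es_url="http://localhost:9200",
               tag_blacklist=set(["job", "jobs", "contest"]))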
Example #3
import sys