def get_duration(ndb):
    # Estimate the core duration of the event stored in `ndb`.
    # Fallback chain: Wikidata begin date -> first three-month window that
    # contains any tagged mention -> hard-coded 1987 default.
    dh = db.DbHandler(ndb)
    event_name = dh.get_info_dic()["event_name"]
    r = get_begin_date_wikidata(event_name)
    # 1900-01-01 is the sentinel returned when Wikidata has no begin date.
    senti = (datetime.date(1900, 1, 1), datetime.date(1900, 1, 1) + threemonth)
    dur = senti
    if r != datetime.date(1900, 1, 1):
        dur = (r, r + threemonth)
    b = dt.str2date(dh.get_info_dic()['begin_date'])
    e = dt.str2date(dh.get_info_dic()['end_date'])
    if dur == senti:
        # No Wikidata date: take the first window with at least one mention.
        while b < e:
            if len(dh.get_ent_all_tag(b, b + threemonth, 100000)) > 0:
                dur = (b, b + threemonth)
                break
            b += threemonth
    if dur == senti:
        # Still nothing: fall back to a fixed default window.
        dur = (datetime.date(1987, 1, 1), datetime.date(1987, 1, 1) + threemonth)
    dh.insert_info('event_begin_date', dt.date2str(dur[0]))
    dh.insert_info('event_duration',
                   dt.date2str(dur[0]) + '-' + dt.date2str(dur[1]))
    print "duration of", ndb, "is", dur[0], "-", dur[1]
    return dur
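# --- Illustration (not part of the pipeline) -------------------------------
# A minimal, self-contained sketch of the fallback chain above.
# `find_first_active_window` and `has_mentions` are hypothetical names used
# only for this example; the real code queries db.DbHandler directly.
import datetime

THREEMONTH = datetime.timedelta(days=92)

def find_first_active_window(begin, end, has_mentions,
                             default=datetime.date(1987, 1, 1)):
    """Return the first (start, start + THREEMONTH) window inside
    [begin, end) for which has_mentions(start, end) is truthy, or a
    window anchored at `default` when no window qualifies."""
    b = begin
    while b < end:
        if has_mentions(b, b + THREEMONTH):
            return (b, b + THREEMONTH)
        b += THREEMONTH
    return (default, default + THREEMONTH)

# Example: the second window is the first one with mentions.
# find_first_active_window(datetime.date(2001, 1, 1),
#                          datetime.date(2001, 12, 31),
#                          lambda s, e: s >= datetime.date(2001, 4, 3))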
def get_granurality_list(ndb):
    # Connect to the DB and count, per three-month period, how many
    # HeidelTime values fall into each granularity class.
    dh = db.DbHandler(ndb)
    info = dh.get_info_dic()
    begin_event_date = dt.str2date(info['event_begin_date'])
    begin_date = dt.str2date(info['begin_date'])
    end_date = dt.str2date(info['end_date'])

    def make_out_data(get_data_func):
        threemonth = datetime.timedelta(days=92)
        dout = ["period", "xxxx-xx-xx", "xxxx-xx", "xxxx",
                "PRESENT_REF", 'xxxx-SU/SP/FA/WN', "PAST_REF"]
        # Normalized column headers, in the same order as the raw columns,
        # so that header index 7..12 matches data index 7..12 below.
        for i in range(1, 7):
            dout.append("normalized_" + dout[i])
        dout = [dout]
        b = begin_date
        e = end_date
        while b < e:
            p = get_data_func(b, b + threemonth)
            p = map(lambda x: x[0], p)
            d = len(filter(is_date, p))
            m = len(filter(is_mounth, p))
            y = len(filter(is_year, p))
            n = len(filter(is_present, p))
            s = len(filter(is_season, p))
            o = len(filter(is_past, p))
            dout.append([dt.date2str(b), d, m, y, n, s, o, 0, 0, 0, 0, 0, 0])
            b += threemonth
        # Fill the six normalized columns: each raw column divided by its max.
        for i in range(-6, 0):
            maxv = 0
            for j in range(1, len(dout)):
                maxv = max(maxv, dout[j][i - 6])
            if maxv != 0:
                for j in range(1, len(dout)):
                    dout[j][i] = dout[j][i - 6] / float(maxv)
        return dout

    return make_out_data(lambda b, e: dh.get_ent(b, e, 'HEIDEL_TIME', 1000000000))
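# --- Illustration (not part of the pipeline) -------------------------------
# The predicates is_date / is_mounth / is_year / is_present / is_season /
# is_past are defined elsewhere in this repo; this is a hedged sketch of the
# semantics they are assumed to have for HeidelTime-normalized values such as
# "2016-02-12", "2016-02", "2016", "PRESENT_REF", "2016-SU", "PAST_REF".
# (The exact season-code set is an assumption based on TIMEX3.)
import re

def is_date(v):    return re.match(r'^\d{4}-\d{2}-\d{2}$', v) is not None
def is_mounth(v):  return re.match(r'^\d{4}-\d{2}$', v) is not None
def is_year(v):    return re.match(r'^\d{4}$', v) is not None
def is_present(v): return v == 'PRESENT_REF'
def is_season(v):  return re.match(r'^\d{4}-(SP|SU|FA|WI)$', v) is not None
def is_past(v):    return v == 'PAST_REF'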
def get_sheet(ndb, tag):
    print "get_top_entities_list.py-get_sheet-get_top_entities_list"
    d1 = get_top_entities_list(ndb, tag)
    print "Connect to DB and create table"
    dh = db.DbHandler(ndb)
    begin_event_date = BEGIN_DATE
    end_event_date = dt.str2date(dh.get_info_dic()['end_date'])
    get_data_func = (lambda b, e: dh.get_ent(b, e, tag, 1000000000))
    print "get_sheet-changing_freqeuency_word"
    d2 = changing_freqeuency_word(begin_event_date, end_event_date, get_data_func)
    d1.extend(d2)
    d3 = score_word(begin_event_date, end_event_date, get_data_func)
    d1.extend(d3)
    return d1
def summarize_tf_idf_sheet(list_data):
    # Average the tf-idf sheets of several events after aligning each sheet
    # at its own event-begin row (see nonzero_starts).
    data0 = list_data[0][0]
    H = len(data0)
    W = len(data0[0])
    nonzero_starts = []
    for (data, dur) in list_data:
        for i in range(1, H):
            if dur[0] <= dt.str2date(data0[i][0]):
                nonzero_starts.append(i)
                break
    retdata = []
    for i in range(H):
        l = []
        for j in range(W):
            if i == 0 or j == 0:
                # Keep the header row and the period column of the first sheet.
                l.append(list_data[0][i][j])
            else:
                a = 0
                n = 0
                for (k, (d, dur)) in enumerate(list_data):
                    if i + nonzero_starts[k] < H:
                        # Read the row aligned to this event's begin.
                        a += d[i + nonzero_starts[k]][j]
                        n += 1
                if n != 0:
                    a /= float(n)
                l.append(a)
        retdata.append(l)
    return retdata
def summarize_article_for_each_query(list_data):
    # Average the per-period article counts of several events, after
    # normalizing each series and aligning it at its event-begin period.
    periods = []
    naverages = []
    list_narts = []
    nonzero_starts = []
    for i in range(1, len(list_data[0][0][0])):
        periods.append(list_data[0][0][0][i])
    for (data, dur) in list_data:
        list_narts.append([])
        nonzero = len(data[0]) - 2
        s = 0
        for i in range(1, len(data[0])):
            na = 0
            for j in range(1, len(data)):
                na += data[j][i]
            a = data[0][i].split("_")[0]
            b = dur[0]
            if isinstance(b, datetime.datetime):
                b = b.date()
            if b <= dt.str2date(a):
                nonzero = min(i, nonzero)
            list_narts[-1].append(na)
            s += na
        # Normalize so the counts of each event sum to 1.
        if s != 0:
            for i in range(len(list_narts[-1])):
                list_narts[-1][i] = float(list_narts[-1][i]) / float(s)
        nonzero_starts.append(nonzero)
    for i in range(len(list_narts[0])):
        s = 0
        n = 0
        for j in range(len(list_narts)):
            # Shift each series so that position 0 is its own event begin.
            if nonzero_starts[j] + i < len(list_narts[j]):
                s += list_narts[j][nonzero_starts[j] + i]
                n += 1
        if n != 0:
            naverages.append(float(s) / float(n))
        else:
            naverages.append(0)
    retdata = []
    for (p, n) in zip(periods, naverages):
        retdata.append([p, n])
    return retdata
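# --- Illustration (not part of the pipeline) -------------------------------
# The summarize_* functions in this module share one pattern: shift every
# series so that position 0 is its own first "active" index, then average
# position-wise over however many series still have data there. A minimal
# self-contained version (`aligned_average` is a hypothetical helper name):
def aligned_average(series_list, starts):
    """series_list: list of equal-meaning numeric lists;
    starts: for each series, the index to treat as position 0."""
    length = max(len(s) for s in series_list)
    out = []
    for i in range(length):
        vals = [s[st + i] for (s, st) in zip(series_list, starts)
                if st + i < len(s)]
        out.append(sum(vals) / float(len(vals)) if vals else 0.0)
    return out

# aligned_average([[0, 0, 5, 7], [3, 9, 0, 0]], [2, 0])
# -> [4.0, 8.0, 0.0, 0.0]   (5 & 3 averaged, then 7 & 9)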
def peak_period_mention(ndb):
    # Return the start of the three-month window with the most mentions.
    dcon = db.DbHandler(ndb)
    b = BEGIN_DATE
    e = dt.str2date(dcon.get_info_dic()["end_date"])
    threemonth = datetime.timedelta(days=92)
    mm = 0
    ret = b
    while b < e:
        m = dcon.get_ref_num(b, b + threemonth)
        if mm < m:
            mm = m
            ret = b
        b += threemonth
    return ret
def get_entity_count(fdb):
    # Per three-month period: mention count plus all/new/unique counts for
    # PERSON, LOCATION and ORGANIZATION entities.
    dcon = db.DbHandler(fdb)
    begin_event_date = BEGIN_DATE
    end_event_date = dt.str2date(dcon.get_info_dic()['end_date'])
    threemonth = datetime.timedelta(days=92)
    b = begin_event_date
    e = end_event_date
    output = [[
        "period", "#mention",
        "#person(all)", "#person(new)", "#person(unique)",
        "#location(all)", "#location(new)", "#location(unique)",
        "#organization(all)", "#organization(new)", "#organization(unique)"
    ]]
    ps = set()
    ls = set()
    os = set()
    while b < e:
        d = dt.date2str(b)
        m = dcon.get_ref_num(b, b + threemonth)
        p = dcon.get_ent(b, b + threemonth, "PERSON", 100000000)
        l = dcon.get_ent(b, b + threemonth, "LOCATION", 100000000)
        o = dcon.get_ent(b, b + threemonth, "ORGANIZATION", 100000000)
        # "new" = entities never seen in any earlier period.
        nps = ps.union(set(p))
        nls = ls.union(set(l))
        nos = os.union(set(o))
        output.append([
            d, m,
            len(p), len(nps) - len(ps), len(set(p)),
            len(l), len(nls) - len(ls), len(set(l)),
            len(o), len(nos) - len(os), len(set(o))
        ])
        ps = nps
        ls = nls
        os = nos
        b += threemonth
    return output
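# --- Illustration (not part of the pipeline) -------------------------------
# Helper sketch (hypothetical, not used elsewhere in this repo): dump the
# get_entity_count() sheet to a CSV file.
def write_entity_count_csv(fdb, out_path):
    import csv
    rows = get_entity_count(fdb)
    with open(out_path, 'wb') as f:  # 'wb': the Python 2 csv module wants bytes
        csv.writer(f).writerows(rows)

# Usage (placeholder paths):
# write_entity_count_csv('event.db', 'entity_count.csv')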
def summarize_entity_count(list_data):
    # Average the entity-count sheets of several events: normalize every
    # column per event, align each sheet at its event-begin row, then take
    # the position-wise mean.
    retdata = copy.deepcopy(list_data[0][0])
    for i in range(1, len(retdata)):
        for j in range(1, len(retdata[i])):
            retdata[i][j] = 0
    nonzero_starts = [-1 for d in list_data]
    for (k, (d, dur)) in enumerate(list_data):
        for i in range(1, len(d)):
            a = dur[0]
            if isinstance(a, datetime.datetime):
                a = a.date()
            if nonzero_starts[k] == -1 and a <= dt.str2date(d[i][0]):
                nonzero_starts[k] = i
        # Normalize each column so that it sums to 1 for this event.
        for j in range(1, len(d[0])):
            s = 0.0
            for i in range(1, len(d)):
                s += float(d[i][j])
            if s != 0.0:
                for i in range(1, len(d)):
                    d[i][j] = float(d[i][j]) / float(s)
    for i in range(1, len(retdata)):
        for j in range(1, len(retdata[i])):
            s = 0
            n = 0
            for (k, (d, dur)) in enumerate(list_data):
                if nonzero_starts[k] + i < len(retdata):
                    # Shift: read the row aligned to this event's begin.
                    s += float(d[nonzero_starts[k] + i][j])
                    n += 1
            if n != 0:
                retdata[i][j] = float(s) / float(n)
    return retdata
def summarize_top_enetities_list(list_data):
    # Average the metric columns (21..W-1) of several top-entities sheets;
    # the ten Name/# pairs (columns 1..20) are event specific and dropped.
    W = len(list_data[0][0][0])
    H = len(list_data[0][0])
    data0 = list_data[0][0]
    retdata = list()
    l = []
    for j in [0] + range(21, W):
        l.append(data0[0][j])
    retdata.append(l)
    nonzero_starts = []
    for (data, dur) in list_data:
        for i in range(1, H):
            # Stop at the first row whose label is not a date.
            if not data0[i][0].replace("-", "").isdigit():
                break
            if dur[0] <= dt.str2date(data[i][0]):
                nonzero_starts.append(i)
                break
    # Truncate H at the first non-date row.
    for i in range(1, H):
        if not data0[i][0].replace("-", "").isdigit():
            H = i
            break
    for i in range(0, H):
        l = [data0[i][0]]
        for j in range(21, W):
            a = float(0)
            n = 0
            for (k, (d, dur)) in enumerate(list_data):
                if i + nonzero_starts[k] < H:
                    # Read the row aligned to this event's begin.
                    a += float(d[i + nonzero_starts[k]][j])
                    n += 1
            if n != 0:
                a /= float(n)
            l.append(a)
        retdata.append(l)
    return retdata
def get_top_entities_list(ndb, tag):
    print "get_top_entities_list"
    # Connect to DB and create table
    dh = db.DbHandler(ndb)
    begin_event_date = BEGIN_DATE
    end_date = dt.str2date(dh.get_info_dic()['end_date'])
    get_data_func = (lambda b, e: dh.get_ent(b, e, tag, 1000000000))
    dout = ["period"]
    for i in range(0, 10):
        dout.append("Name")
        dout.append("#")
    dout.append("one_to_before_one_cosine_sim")
    dout.append("one_to_all_cosine_sim")
    dout.append("one_to_all_before_cosine_sim")
    dout.append("one_to_all_future_cosine_sim")
    dout.append("one_to_first_three_mounth_sim")
    dout.append("one_to_peak_three_mounth_sim")
    dout.append("one_to_duration_sim")
    dout.append("entropy")
    dout.append("normalized_" + dout[-1])
    dout = [dout]
    all_dict = get_all_dict(begin_event_date, end_date, get_data_func)
    future_dict = dict()
    past_dict = dict()
    before_dict = dict()
    first_dict = dict()
    peak_dict = dict()
    dur_dict = dict()
    # First pass: collect the first-window, peak-window and duration dicts.
    print "get_top_entities_list first loop"
    b = begin_event_date
    e = end_date
    peak_mention = get_entity_count.peak_period_mention(ndb)
    dur = duration_event.get_duration(ndb)
    while b < e:
        p = get_data_func(b, b + threemonth)
        ds = sorted(map(lambda w: (w[0], p.count(w)), list(set(p))),
                    key=lambda x: x[1], reverse=True)
        if begin_event_date <= b and begin_event_date < b + threemonth:
            for d in ds:
                first_dict[d[0]] = first_dict.get(d[0], 0) + d[1]
        if b < peak_mention and peak_mention <= b + threemonth:
            for d in ds:
                peak_dict[d[0]] = d[1]
        if dur[0] <= b and b + threemonth <= dur[1]:
            for d in ds:
                dur_dict[d[0]] = dur_dict.get(d[0], 0) + d[1]
        b += threemonth
    future_dict = copy.deepcopy(all_dict)
    b = begin_event_date
    e = end_date
    print "get_top_entities_list second loop"
    while b < e:
        do = [dt.date2str(b)]
        p = get_data_func(b, b + threemonth)
        ds = sorted(map(lambda w: (w[0], p.count(w)), list(set(p))),
                    key=lambda x: x[1], reverse=True)
        now_dict = dict()
        for d in ds:
            now_dict[d[0]] = d[1]
        for d in ds[0:10]:
            do.append(d[0])
            do.append(d[1])
        for i in range(0, 10 - len(ds[0:10])):
            do.append("")
            do.append("")
        # Remove the current window's counts so future_dict only holds
        # what comes after this window.
        future_dict = sub_dict(future_dict, now_dict)
        do.append(cosine_sim(now_dict, before_dict))
        do.append(cosine_sim(now_dict, all_dict))
        do.append(cosine_sim(now_dict, past_dict))
        do.append(cosine_sim(now_dict, future_dict))
        do.append(cosine_sim(now_dict, first_dict))
        do.append(cosine_sim(now_dict, peak_dict))
        do.append(cosine_sim(now_dict, dur_dict))
        do.append(entropy(now_dict))
        do.append(0)
        dout.append(do)
        before_dict = now_dict
        past_dict = add_dict(past_dict, now_dict)
        b += threemonth
    # Calculate normalized_entropy
    maxv = 0
    for i in range(1, len(dout)):
        maxv = max(maxv, dout[i][column_entropy])
    if maxv != 0:
        for i in range(1, len(dout)):
            dout[i][column_entropy + 1] = dout[i][column_entropy] / float(maxv)
    return dout
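# --- Illustration (not part of the pipeline) -------------------------------
# cosine_sim, entropy, add_dict and sub_dict are defined elsewhere in this
# repo; a minimal self-contained sketch of their assumed semantics, with
# dicts mapping entity -> count:
import math

def cosine_sim(u, v):
    dot = sum(c * v.get(w, 0) for (w, c) in u.items())
    nu = math.sqrt(sum(c * c for c in u.values()))
    nv = math.sqrt(sum(c * c for c in v.values()))
    return dot / (nu * nv) if nu and nv else 0.0

def entropy(u):
    total = float(sum(u.values()))
    if total == 0:
        return 0.0
    return -sum((c / total) * math.log(c / total, 2)
                for c in u.values() if c > 0)

def add_dict(u, v):
    out = dict(u)
    for (w, c) in v.items():
        out[w] = out.get(w, 0) + c
    return out

def sub_dict(u, v):
    out = dict(u)
    for (w, c) in v.items():
        out[w] = out.get(w, 0) - c
    return out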
def make_tf_idf_sheet(ndb):
    # Connect to DB and create table
    dh = db.DbHandler(ndb)
    begin_event_date = BEGIN_DATE
    end_date = dt.str2date(dh.get_info_dic()['end_date'])

    def make_out_data(get_data_func):
        ti = tf_idf.TF_IDF(get_data_func, begin_event_date, end_date)
        threemonth = datetime.timedelta(days=92)
        dout = ["period",
                "tf_idf_one_to_before_one_cosine_sim",
                "tf_idf_one_to_all_cosine_sim",
                "tf_idf_one_to_all_before_cosine_sim",
                "tf_idf_one_to_all_future_cosine_sim",
                "tf_idf_one_to_first_three_mounth_sim",
                "tf_idf_one_to_peak_three_mounth_sim",
                "tf_idf_one_to_duration_sim"]
        dout = [dout]
        # Locate the three-month window that contains the mention peak.
        b = begin_event_date
        e = end_date
        peak_mention = get_entity_count.peak_period_mention(ndb)
        peak_begin = begin_event_date
        peak_end = begin_event_date + threemonth
        while b < e:
            if b <= peak_mention and peak_mention <= b + threemonth:
                peak_begin = b
                peak_end = b + threemonth
            b += threemonth
        b = begin_event_date
        e = end_date
        dur = duration_event.get_duration(ndb)
        while b < e:
            print "tf_idf_sheet-second-loop", b
            do = [dt.date2str(b)]
            now_tiv = ti.get_tf_idf_vector(b, b + threemonth)
            # tf_idf_one_to_before_one_cosine_sim
            if b != begin_event_date:
                do.append(cosine_sim(
                    now_tiv, ti.get_tf_idf_vector(b - threemonth, b)))
            else:
                do.append(0)
            # tf_idf_one_to_all_cosine_sim
            do.append(cosine_sim(now_tiv, ti.tf_idf_all))
            # tf_idf_one_to_all_before_cosine_sim
            if b != begin_event_date:
                do.append(cosine_sim(
                    now_tiv, ti.get_tf_idf_vector(begin_event_date, b)))
            else:
                do.append(0)
            # tf_idf_one_to_all_future_cosine_sim
            do.append(cosine_sim(
                now_tiv, ti.get_tf_idf_vector(b + threemonth, e)))
            # tf_idf_one_to_first_three_mounth_sim
            do.append(cosine_sim(now_tiv, ti.get_tf_idf_vector(
                begin_event_date, begin_event_date + threemonth)))
            # tf_idf_one_to_peak_three_mounth_sim
            do.append(cosine_sim(
                now_tiv, ti.get_tf_idf_vector(peak_begin, peak_end)))
            # tf_idf_one_to_duration_sim
            do.append(cosine_sim(
                now_tiv, ti.get_tf_idf_vector(dur[0], dur[1])))
            dout.append(do)
            b += threemonth
        return dout

    return make_out_data(lambda b, e: dh.get_ent_all_tag(b, e, 1000000000))
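# --- Illustration (not part of the pipeline) -------------------------------
# Hedged sketch of the vectors tf_idf.TF_IDF is assumed to produce: term
# frequency within one window, idf over all windows of the span. The helper
# name and the exact idf variant are assumptions made for this example.
import math

def tf_idf_vector(window_counts, all_window_counts):
    """window_counts: {term: count} for one window (it must be one of the
    entries of all_window_counts, so df >= 1); all_window_counts: list of
    such dicts, one per three-month window."""
    n = len(all_window_counts)
    out = {}
    for (term, tf) in window_counts.items():
        df = sum(1 for w in all_window_counts if term in w)
        out[term] = tf * math.log(float(n) / df)
    return out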
    '--lim-word', type=int, default=LIM_WORD, help='# of top words')
ap.add_argument('-s', '--lim-hscore', type=int, default=LIM_HSCORE,
                help='# of high score mentions')
ap.add_argument('-e', '--top-word', help='create "top N X sheet"')
args = ap.parse_args()

# Connect to DB and create table
dh = db.DbHandler(args.db_file_name)
event_name = dh.get_info_dic()["event_name"].replace(" ", "")
event_begin_date = dt.str2date(dh.get_info_dic()["event_begin_date"])

print 'DB: {0}'.format(args.db_file_name)
if not args.xlsx_file_name:
    args.xlsx_file_name = args.db_file_name.split('.')[0] + '.xlsx'
print 'Excel workbook: {0}'.format(args.xlsx_file_name)
print '# of months / period: {0}'.format(args.month_delta)
print '# of top words: {0}'.format(args.lim_word)
print '# of high score mentions: {0}'.format(args.lim_hscore)
print '-' * 40

# Create DB handler and xlsx workbook
xh = xlsx.XlsxHandler(args.db_file_name, args.xlsx_file_name, args.month_delta)
ap.add_argument('-u', '--solr-url', default=KAREN, help='Solr URL')
args = ap.parse_args()

print 'DB: {0}'.format(args.db_file_name)
print 'Solr URL: {0}'.format(args.solr_url)
print '-' * 40

# Connect to DB file
dh = db.DbHandler(args.db_file_name)

# Get info
print 'Getting info'
info_dic = dh.get_info_dic()
bdate_s = info_dic['begin_date']
edate_s = info_dic['end_date']
bdate = dt.str2date(bdate_s)
edate = dt.str2date(edate_s)
ev_bdate_s = info_dic['event_begin_date']
ev_edate_s = info_dic['event_end_date']
ev_bdate = dt.str2date(ev_bdate_s)
ev_edate = dt.str2date(ev_edate_s)
print 'article begin date: {0}'.format(bdate_s)
print 'article end date: {0}'.format(edate_s)
print 'event begin date: {0}'.format(ev_bdate_s)
print 'event end date: {0}'.format(ev_edate_s)
print '-' * 40

# Create Solr NITF handler
sh = slr.SolrNitfHandler(args.solr_url, bdate, edate)

# Create tables
        words = zip(p["ner"], p["tokens"], p["ner"])
        stop = stopwords.words("english")
        words = filter(lambda x: x[1] not in stop, words)
        # Lowercasing every token may be too aggressive for NER surface
        # forms; kept as-is for now.
        words = map(lambda x: (x[0], x[1].lower(), x[2]), words)
        # Merge adjacent tokens that share the same non-'O' NER tag into one
        # multi-word surface form ("New" + "York" -> "New York").
        ws = list()
        w = ("", "", "")
        for v in words:
            if v[0] != 'O' and v[0] == w[0]:
                w = (w[0], w[1] + " " + v[1], w[2])
            else:
                ws.append(w)
                w = v
        if w[0] != "":
            ws.append(w)
        words = ws
        # words[0] is the ("", "", "") sentinel, so drop it.
        return words[1:]


if __name__ == '__main__':
    kch = kawata_corenlp_handler()
    print 'kch = kawata_corenlp_handler()'
    p = kch.get_words(u"I lived in New York in 2016",
                      dt.str2date(u"2016-02-12"))
    print p
    # The second call feeds a malformed date string on purpose.
    p = kch.get_words(u"I lived in New York in 206/09/18",
                      dt.str2date(u"2016-02-12"))
    print p
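# --- Illustration (not part of the pipeline) -------------------------------
# Self-contained sketch of the adjacent-tag merge above (`merge_ner_spans`
# is a hypothetical helper name; tags and tokens are made up): runs of
# tokens sharing the same non-'O' NER tag fuse into one surface form, while
# 'O' tokens pass through unchanged.
def merge_ner_spans(tagged):
    merged = []
    for (tag, tok) in tagged:
        if merged and tag != 'O' and merged[-1][0] == tag:
            merged[-1] = (tag, merged[-1][1] + ' ' + tok)
        else:
            merged.append((tag, tok))
    return merged

# merge_ner_spans([('LOCATION', 'New'), ('LOCATION', 'York'), ('O', 'in')])
# -> [('LOCATION', 'New York'), ('O', 'in')]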