Example #1
def get_top_entities_list(ndb, tag):
    print "get_top_entitites_list-get_top_entities_list"
    # Connect to DB and create table
    dh = db.DbHandler(ndb)
    # begin_event_date = dh.get_info_dic()['event_begin_date']
    # begin_event_date = dt.str2date(begin_event_date)
    begin_event_date = BEGIN_DATE
    end_event_date = dh.get_info_dic()['end_date']
    end_event_date = dt.str2date(end_event_date)
    # begin_date = dh.get_info_dic()['begin_date']
    # begin_date = dt.str2date(begin_date)
    end_date = dh.get_info_dic()['end_date']
    end_date = dt.str2date(end_date)

    # get_data_func(b, e) fetches the entities tagged `tag` between b and e;
    # the large last argument to get_ent is presumably a row limit, and
    # `threemonth` / BEGIN_DATE are presumably module-level constants
    get_data_func = (lambda b, e: dh.get_ent(b, e, tag, 1000000000))

    dout = ["period"]
    for i in range(0, 10):
        dout.append("Name")
        dout.append("#")
    dout.append("one_to_before_one_cosine_sim")
    dout.append("one_to_all_cosine_sim")
    dout.append("one_to_all_before_cosine_sim")
    dout.append("one_to_all_future_cosine_sim")
    dout.append("one_to_first_three_mounth_sim")
    dout.append("one_to_peak_three_mounth_sim")
    dout.append("one_to_duration_sim")

    dout.append("entropy")
    dout.append("normalized_" + dout[-1])

    dout = [dout]
    all_dict = get_all_dict(begin_event_date, end_event_date, get_data_func)
    future_dict = dict()
    past_dict = dict()
    before_dict = dict()
    first_dict = dict()
    peak_dict = dict()
    dur_dict = dict()

    # Build first_dict, peak_dict and dur_dict over three-month windows
    print "make period dicts"
    b = begin_event_date
    e = end_date
    peak_mention = get_entity_count.peak_period_mention(ndb)
    # peak_begin = 0
    # peak_end = 0
    dur = duration_event.get_duration(ndb)

    # print "get_top_entities_list first loop"
    while b < e:
        # print "get_top_entities_list first loop", b
        p = get_data_func(b, b + threemonth)
        ds = sorted(map(lambda w: (w[0], p.count(w)), list(set(p))),
                    key=lambda x: x[1],
                    reverse=True)
        if b <= begin_event_date and begin_event_date < b + threemonth:
            # only the first three-month window contributes to first_dict
            for d in ds:
                if d[0] in first_dict:
                    first_dict[d[0]] += d[1]
                else:
                    first_dict[d[0]] = d[1]
        if b < peak_mention and peak_mention <= b + threemonth:
            for d in ds:
                peak_dict[d[0]] = d[1]
        if dur[0] <= b and b + threemonth <= dur[1]:
            for d in ds:
                if d[0] in dur_dict:
                    dur_dict[d[0]] += d[1]
                else:
                    dur_dict[d[0]] = d[1]
        b += threemonth
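    # future_dict starts as a copy of the full counts; each window's counts
    # are subtracted from it in the second loop, so it only reflects later periods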
    future_dict = copy.deepcopy(all_dict)

    b = begin_event_date
    e = end_date
    print "get_top_entities_list second loop"
    while b < e:
        # print "get_top_entities_list second loop", b
        do = [dt.date2str(b)]
        p = get_data_func(b, b + threemonth)
        ds = sorted(map(lambda w: (w[0], p.count(w)), list(set(p))),
                    key=lambda x: x[1],
                    reverse=True)
        now_dict = dict()
        for d in ds:
            now_dict[d[0]] = d[1]
        for d in ds[0:10]:
            do.append(d[0])
            do.append(d[1])
        for i in range(0, 10 - len(ds[0:10])):
            do.append("")
            do.append("")
        future_dict = sub_dict(future_dict, now_dict)

        do.append(cosine_sim(now_dict, before_dict))
        do.append(cosine_sim(now_dict, all_dict))
        do.append(cosine_sim(now_dict, past_dict))
        do.append(cosine_sim(now_dict, future_dict))
        do.append(cosine_sim(now_dict, first_dict))
        do.append(cosine_sim(now_dict, peak_dict))
        do.append(cosine_sim(now_dict, dur_dict))

        do.append(entropy(now_dict))
        do.append(0)  # placeholder for normalized_entropy, filled in after the loop

        dout.append(do)

        before_dict = now_dict
        past_dict = add_dict(past_dict, now_dict)

        b += threemonth

    # Calculate normalized_entropy; column_entropy is assumed to be a
    # module-level constant giving the index of the "entropy" column
    maxv = 0
    for i in range(1, len(dout)):
        maxv = max(maxv, dout[i][column_entropy])
    if maxv != 0:
        for i in range(1, len(dout)):
            dout[i][column_entropy + 1] = dout[i][column_entropy] / float(maxv)

    return dout
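The helpers used by get_top_entities_list (get_all_dict, cosine_sim, add_dict, sub_dict, entropy) are not included in this example. Below is a minimal sketch of what they are assumed to compute over {entity: count} dicts; it is a reconstruction, not the project's implementation.

# Hypothetical reconstructions of the helpers used above.
import math


def get_all_dict(begin, end, get_data_func):
    # {entity: count} over the whole [begin, end) span.
    counts = {}
    for w in get_data_func(begin, end):
        counts[w[0]] = counts.get(w[0], 0) + 1
    return counts


def cosine_sim(d1, d2):
    # Cosine similarity between two {key: count} dicts; 0 if either is empty.
    common = set(d1) & set(d2)
    num = sum(d1[k] * d2[k] for k in common)
    den = math.sqrt(sum(v * v for v in d1.values())) * \
        math.sqrt(sum(v * v for v in d2.values()))
    return num / den if den else 0


def add_dict(d1, d2):
    # New dict with the counts of d1 and d2 summed per key.
    out = dict(d1)
    for k, v in d2.items():
        out[k] = out.get(k, 0) + v
    return out


def sub_dict(d1, d2):
    # New dict with d2's counts subtracted from d1's.
    out = dict(d1)
    for k, v in d2.items():
        out[k] = out.get(k, 0) - v
    return out


def entropy(d):
    # Shannon entropy (base 2) of the count distribution in d.
    total = float(sum(d.values()))
    if total == 0:
        return 0
    return -sum((v / total) * math.log(v / total, 2)
                for v in d.values() if v > 0)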
Example #2
    print 'DB: {0}'.format(args.db_file_name)
    if not args.xlsx_file_name:
        args.xlsx_file_name = args.db_file_name.split('.')[0] + '.xlsx'
    print 'Excel workbook: {0}'.format(args.xlsx_file_name)
    print '# of months / period: {0}'.format(args.month_delta)
    print '# of top words: {0}'.format(args.lim_word)
    print '# of high score mentions: {0}'.format(args.lim_hscore)
    print '-' * 40

    # Create DB handler and xlsx workbook
    xh = xlsx.XlsxHandler(args.db_file_name, args.xlsx_file_name,
                          args.month_delta)

    try:
        # Set duration info to database
        duration_event.get_duration(args.db_file_name)
        # Set peak_mention info to database
        set_peak_to_info.set_peak(args.db_file_name)

        if int(xh.info_dic["event_doc_num"]) == 0:
            raise sqlite3.OperationalError(
                'int(xh.info_dic["event_doc_num"]) == 0')

        # Write sheets
        print 'Writing "info" sheet'
        xh.write_info()

        print "Writing entity_count sheet"
        dout = get_entity_count.get_entity_count(args.db_file_name)
        xh.write_csv_date("entity_count", dout)
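Example #2 is a fragment of a command-line entry point and assumes an args namespace with db_file_name, xlsx_file_name, month_delta, lim_word and lim_hscore. A minimal argparse setup that would provide those attributes might look like the sketch below; the flag names, help texts and defaults are guesses, not taken from the original script.

# Hypothetical argument parser supplying the attributes used in Example #2.
import argparse

parser = argparse.ArgumentParser(
    description='Export an event database to an xlsx workbook')
parser.add_argument('db_file_name', help='input sqlite3 database')
parser.add_argument('--xlsx_file_name', default=None,
                    help='output workbook (defaults to <db name>.xlsx)')
parser.add_argument('--month_delta', type=int, default=3,
                    help='number of months per period (assumed default)')
parser.add_argument('--lim_word', type=int, default=10,
                    help='number of top words to keep (assumed default)')
parser.add_argument('--lim_hscore', type=int, default=10,
                    help='number of high score mentions to keep (assumed default)')
args = parser.parse_args()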
Example #3
    def make_out_data(get_data_func):
        ti = tf_idf.TF_IDF(get_data_func, begin_event_date, end_date)

        threemonth = datetime.timedelta(days=92)
        dout = ["period"]

        dout.append("tf_idf_one_to_before_one_cosine_sim")
        dout.append("tf_idf_one_to_all_cosine_sim")
        dout.append("tf_idf_one_to_all_before_cosine_sim")
        dout.append("tf_idf_one_to_all_future_cosine_sim")
        dout.append("tf_idf_one_to_first_three_mounth_sim")
        dout.append("tf_idf_one_to_peak_three_mounth_sim")
        dout.append("tf_idf_one_to_duration_sim")

        dout = [dout]

        b = begin_event_date
        e = end_date
        peak_mention = get_entity_count.peak_period_mention(ndb)
        peak_begin = 0
        peak_end = 0
        while b < e:
            if b <= peak_mention and peak_mention <= b + threemonth:
                peak_begin = b
                peak_end = b + threemonth
            b += threemonth

        b = begin_event_date
        e = end_date
        dur = duration_event.get_duration(ndb)
        while b < e:
            print "tf_idf_sheet-second-loop", b
            do = [dt.date2str(b)]

            now_tiv = ti.get_tf_idf_vector(b, b + threemonth)

            # tf_idf_one_to_before_one_cosine_sim
            if b != begin_event_date:
                do.append(cosine_sim(
                    now_tiv, ti.get_tf_idf_vector(b - threemonth, b)))
            else:
                do.append(0)

            # tf_idf_one_to_all_cosine_sim
            do.append(cosine_sim(now_tiv, ti.tf_idf_all))

            # tf_idf_one_to_all_before_cosine_sim
            c = cosine_sim(
                now_tiv, ti.get_tf_idf_vector(begin_event_date, b))
            print "tf_idf_one_to_all_before_cosine_sim = ", c, begin_event_date, b
            print "now_tiv = "
            print tf_idf.show_tf_idf_dict(now_tiv)
            print "ti.get_tf_idf_vector(begin_event_date, b) = "
            print tf_idf.show_tf_idf_dict(ti.get_tf_idf_vector(begin_event_date, b))
            if b != begin_event_date:
                do.append(c)
            else:
                do.append(0)

            # tf_idf_one_to_all_future_cosine_sim
            c = cosine_sim(
                now_tiv, ti.get_tf_idf_vector(b + threemonth, e))
            print "tf_idf_one_to_all_future_cosine_sim = ", c, b + threemonth, e
            print "now_tiv = "
            print tf_idf.show_tf_idf_dict(now_tiv)
            print "ti.get_tf_idf_vector(b + threemonth, e) = "
            print tf_idf.show_tf_idf_dict(ti.get_tf_idf_vector(b + threemonth, e))
            do.append(c)

            # tf_idf_one_to_first_three_mounth_sim
            do.append(cosine_sim(now_tiv, ti.get_tf_idf_vector(
                begin_event_date, begin_event_date + threemonth)))

            # tf_idf_one_to_peak_three_mounth_sim
            do.append(cosine_sim(
                now_tiv, ti.get_tf_idf_vector(peak_begin, peak_end)))

            # tf_idf_one_to_duration_sim
            do.append(cosine_sim(
                now_tiv, ti.get_tf_idf_vector(dur[0], dur[1])))

            dout.append(do)

            b += threemonth

        return dout
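Example #3 depends on the project's tf_idf module, which is not shown. The sketch below reconstructs the interface the code relies on (TF_IDF(get_data_func, begin, end), get_tf_idf_vector, tf_idf_all, show_tf_idf_dict) with a naive tf-idf that treats each three-month window as one document; the real module very likely differs.

# Assumed shape of the tf_idf module used in Example #3 (sketch only).
import datetime
import math


class TF_IDF(object):
    def __init__(self, get_data_func, begin_date, end_date,
                 period=datetime.timedelta(days=92)):
        # Treat each `period`-long window as one document for the idf counts.
        self.get_data_func = get_data_func
        self.docs = []
        b = begin_date
        while b < end_date:
            terms = [w[0] for w in get_data_func(b, b + period)]
            self.docs.append((b, b + period, terms))
            b += period
        self.tf_idf_all = self.get_tf_idf_vector(begin_date, end_date)

    def _idf(self, term):
        # Smoothed inverse document frequency over the three-month windows.
        df = sum(1 for _, _, terms in self.docs if term in terms)
        return math.log(float(len(self.docs)) / (1 + df))

    def get_tf_idf_vector(self, begin, end):
        # {term: tf * idf} over every window that overlaps [begin, end).
        counts = {}
        for b, e, terms in self.docs:
            if b < end and begin < e:
                for t in terms:
                    counts[t] = counts.get(t, 0) + 1
        total = float(sum(counts.values())) or 1.0
        return dict((t, (c / total) * self._idf(t))
                    for t, c in counts.items())


def show_tf_idf_dict(tiv):
    # Render a tf-idf dict as one readable line for the debug prints above.
    return ", ".join("%s=%.3f" % (k, v)
                     for k, v in sorted(tiv.items(), key=lambda x: -x[1]))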
Example #4
def ndb_to_xlsx_data(ndb):
    pickle_name = ndb + ".pickle"
    if os.path.exists(pickle_name):
        # A cached pickle for this database already exists; skip recomputation
        return

    xlsx_data = []
    try:
        # Set duration info to database
        dur = duration_event.get_duration(ndb)
        # Set peak_mention info to database
        set_peak_to_info.set_peak(ndb)

        dh = db.DbHandler(ndb)
        if int(dh.get_info_dic()["event_doc_num"]) == 0:
            print 'event_doc_num is 0 for', ndb
            raise sqlite3.OperationalError(
                'event_doc_num is 0 for {0}'.format(ndb))

        print "Start for ", ndb

        print "Writing entity_count sheet"
        dout = get_entity_count.get_entity_count(ndb)
        xlsx_data.append(("entity_count", (dout, dur)))

        print "Writing various_name sheet"
        dout = various_name.make_csv_data(ndb)
        xlsx_data.append(("various_name", (dout, dur)))

        print "Writing #articles_for_each_query sheet"
        dout = query_num_sheet.make_csv_data(ndb)
        xlsx_data.append(("articles_for_each_query", (dout, dur)))

        print "Writing top_entities_list_person sheet"
        dout = get_top_entities_list.get_sheet(ndb, "PERSON")
        xlsx_data.append(("top_entities_list_person", (dout, dur)))

        print "Writing top_entities_list_location"
        dout = get_top_entities_list.get_sheet(ndb, "LOCATION")
        xlsx_data.append(("top_entities_list_location", (dout, dur)))

        print "Writing top_entities_list_organization"
        dout = get_top_entities_list.get_sheet(ndb, "ORGANIZATION")
        xlsx_data.append(("top_entities_list_organization", (dout, dur)))

        print "Writing top_entities_list_date"
        dout = get_top_entities_list.get_sheet(ndb, "DATE")
        xlsx_data.append(("top_entities_list_date", (dout, dur)))

        print "Writing tf_idf sheet."
        dout = tf_idf_sheet.make_tf_idf_sheet(ndb)
        xlsx_data.append(("tf_idf", (dout, dur)))

        print 'Done', ndb, '\n\n'

    except Exception:
        traceback.print_exc()
        print "Could not build all xlsx data for", ndb
        print "Pickling whatever was collected\n\n"

    with open(pickle_name, 'wb') as f:
        pickle.dump(xlsx_data, f)

    return xlsx_data
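A possible driver for Example #4, running ndb_to_xlsx_data over a set of databases and reading the cached pickles back; the '*.db' glob and the printed summary are illustrative only.

# Illustrative driver for ndb_to_xlsx_data; file patterns are placeholders.
import glob
import pickle

for ndb in glob.glob('*.db'):
    ndb_to_xlsx_data(ndb)           # writes <ndb>.pickle (skipped if it already exists)
    with open(ndb + '.pickle', 'rb') as f:
        xlsx_data = pickle.load(f)  # list of (sheet_name, (rows, duration)) tuples
    for sheet_name, (rows, dur) in xlsx_data:
        print sheet_name, len(rows), 'rows, duration', dur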