Example #1
def process_zip_file():

    csv_file = DATA_DIR + '20200215224500.gkg.csv'  # avoid shadowing the stdlib csv module name

    logging.info("Reading " + csv_file)
    df = pd.read_csv(csv_file,
                     header=0,
                     sep='\t',
                     names=GKG_COLUMN_NAMES,
                     encoding='unicode_escape')
    logging.info("gkg df shape " + str(df.shape))

    # append relevant gkg lines to army df
    # Grab rows that contain 'army' in the url column
    # fdf = df[(df['V1ORGANIZATIONS'].str.contains(ARMY_REGEX, case=False) == True) |
    #                   (df['V2ENHANCEDORGANIZATIONS'].str.contains(
    #                       ARMY_REGEX, case=False) == True)
    #         ]

    # logging.info("fdf shape " + str(fdf.shape))
    #print(str(fdf.head()))

    # update the processed files list
    # with open(FILES_PROCESSED_LIST, "a") as f:
    #     f.write(zip_file_url + "\n")

    # delete the zip file
    # if os.path.exists(zip_file):
    #     logging.info("removing " + zip_file)
    #     os.remove(zip_file)
    # else:
    #     logging.info(zip_file + " missing.")

    print(df.head())
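The examples assume a handful of module-level constants (DATA_DIR, GKG_COLUMN_NAMES, ARMY_REGEX, and the various *_DIR and *_LIST paths) defined elsewhere. For orientation, a plausible sketch of GKG_COLUMN_NAMES, based on the 27 tab-separated columns of the GDELT 2.1 GKG format and the identifiers these examples reference; treat it as illustrative, not authoritative:

# Assumed module-level constant (not shown in the examples).
GKG_COLUMN_NAMES = [
    'GKGRECORDID', 'V2.1DATE', 'V2SOURCECOLLECTIONIDENTIFIER',
    'V2SOURCECOMMONNAME', 'V2DOCUMENTIDENTIFIER', 'V1COUNTS', 'V2.1COUNTS',
    'V1THEMES', 'V2ENHANCEDTHEMES', 'V1LOCATIONS', 'V2ENHANCEDLOCATIONS',
    'V1PERSONS', 'V2ENHANCEDPERSONS', 'V1ORGANIZATIONS',
    'V2ENHANCEDORGANIZATIONS', 'V1.5TONE', 'V2.1ENHANCEDDATES', 'V2GCAM',
    'V2.1SHARINGIMAGE', 'V2.1RELATEDIMAGES', 'V2.1SOCIALIMAGEEMBEDS',
    'V2.1SOCIALVIDEOEMBEDS', 'V2.1QUOTATIONS', 'V2.1ALLNAMES',
    'V2.1AMOUNTS', 'V2.1TRANSLATIONINFO', 'V2EXTRASXML',
]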
Example #2
def load_files_df(df_file):

    try:
        logging.info("Reading " + df_file)
        df = pd.read_table(df_file, sep=' ', usecols=[0,1,2], names=['id', 'checksum', 'url'], header=None)
        
    except Exception as e: 
        logging.critical("Not parsed: " + df_file + "\n" + str(e))
        sys.exit()   

    return df
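Note: load_files_df is pointed at GDELT's masterfilelist.txt (downloaded in Example #8). Its space-separated rows appear to be <size> <md5 checksum> <url>; the example labels the first column 'id', but in the published list it is the file size in bytes.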
Example #3
def download(url):

    logging.info("Downloading " + url)
    file_name = fname_from_url(url)

    # fetch first so a failed request doesn't leave an empty file behind
    response = get(url)

    # write the payload in binary mode
    with open(file_name, "wb") as file:
        file.write(response.content)
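Two helpers here are not shown: get is presumably requests.get (from requests import get), and fname_from_url derives a local file name from a URL. A minimal sketch of the latter, assuming it simply takes the last path segment:

import os
from urllib.parse import urlparse

def fname_from_url(url):
    # "http://data.gdeltproject.org/gdeltv2/20200215224500.gkg.csv.zip"
    #   -> "20200215224500.gkg.csv.zip"
    return os.path.basename(urlparse(url).path)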
Example #4
def main():

    NUM_TABLES = 1
    TEMPLATE_FILE = HTML_DIR + "gchart_template.html"
    HISTOGRAM_FILE1 = HISTOGRAM_DIR + "LocationsHistogram.csv"
    TITLE = "LOCATIONS"
    OUTFILE = HTML_DIR + "locations.html"
    TABLE1 = "V1LOCATIONS counts"

    # Read histogram into list
    logging.info("reading " + HISTOGRAM_FILE1)
    with open(HISTOGRAM_FILE1) as f:
        gcam_lines = f.read().splitlines()   

    # build histo dict
    hist_dict = {}
    for line in gcam_lines:
        entries = line.split("\t")
        location = entries[0].strip()
        score = int(entries[1].strip())
        hist_dict[location] = score

    # build the datatable
    dt = '["Feature", "Score"],\n'
    for key, value in hist_dict.items():
        dt += f'["{key}", {value}],\n'

    print(dt)

    # build the title line
    title_option = f'title: "{TABLE1}",\n'
    width_option = 'width: 600,\n'
    height_option = 'height: 100000,\n'
    bar_option = 'bar: 150,'
    options = title_option + width_option + height_option + bar_option
    print(options)

    # read the template
    with open(TEMPLATE_FILE) as f:
        html = f.read()

    html = html.replace("//%DATA_TABLE", dt, 1)
    html = html.replace("//%OPTIONS", options, 1)

    logging.info("writing " + OUTFILE)
    with open(OUTFILE, 'w') as f:
        f.write(html)
Example #5
def process_zip_file(zip_file_url):

    zip_file = fname_from_url(zip_file_url)

    fdf = pd.DataFrame(columns=GKG_COLUMN_NAMES)  # returned empty if the read fails

    try:
        logging.info("Reading " + zip_file)

        #requires unicode escape for some files
        df = pd.read_csv(zip_file,
                         compression='zip',
                         header=0,
                         sep='\t',
                         names=GKG_COLUMN_NAMES,
                         encoding='unicode_escape')

        logging.info("gkg df shape " + str(df.shape))

        # append relevant gkg lines to army df
        # Grab rows that contain 'army' in the url column
        fdf = df[df['V1ORGANIZATIONS'].str.contains(ARMY_REGEX, case=False, na=False)
                 | df['V2ENHANCEDORGANIZATIONS'].str.contains(ARMY_REGEX, case=False, na=False)]

        logging.info("fdf shape " + str(fdf.shape))
        #print(str(fdf.head()))

        # update the processed files list
        with open(FILES_PROCESSED_LIST, "a") as f:
            f.write(zip_file_url + "\n")

        # delete the zip file
        if os.path.exists(zip_file):
            logging.info("removing " + zip_file)
            os.remove(zip_file)
        else:
            logging.info(zip_file + " missing.")

    except Exception as e:
        logging.error("Problem reading " + zip_file + ": " + str(e))

    return fdf
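A note on the filter: str.contains is called with na=False so that rows with an empty organizations cell count as non-matches; without it the method returns NaN for those rows, which is why the commented-out variant in Example #1 compares the result to True instead.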
Example #6
def build_gkg_urls(master_list):
    
    logging.info("Loading GDELT Master File List")
    gdelt_list_df = load_files_df(master_list)
    logging.info("gdelt_list_df " + str(gdelt_list_df.shape))

    regex = '202002.{9}gkg'
    #regex = '2020021001.{5}gkg'
    logging.info("filtering on " + regex)

    # Grab rows that contain 'gkg' in the url column
    gdelt_list_df = gdelt_list_df[gdelt_list_df['url'].str.contains(regex, na=False)]

    # Save the url column only
    gdelt_list_df = gdelt_list_df['url']
    logging.info("filtered list length: " + str(gdelt_list_df.shape[0]))

    # write the gkg file
    with open(GKG_FILE_LIST, 'w') as f:
        logging.info("writing " + GKG_FILE_LIST)
        gdelt_list_df.to_csv(f, index=False, sep="\t", header=False)
Example #7
def main():

    file_list = glob.glob(DATA_DIR + "army_gkg*.csv")

    # create blank full df
    df = pd.DataFrame(columns=GKG_COLUMN_NAMES)

    for f in file_list:
        print(f)

        # read the file into temp df
        tdf = pd.read_csv(f, header=0, sep='\t', names=GKG_COLUMN_NAMES)

        tdf['V2.1DATE'] = tdf['V2.1DATE'].astype(str)

        # append temp df to full df (DataFrame.append was removed in pandas 2.x)
        df = pd.concat([df, tdf], ignore_index=True)

    

    # create a datetime column on the full df
    df['ymd'] = df.apply(lambda row: make_date(row), axis=1)
    df['Datetime'] = pd.to_datetime(df['ymd'], format='%Y-%m-%d')
    #df = df.set_index(pd.DatetimeIndex(df['Datetime']), drop=True)
    df = df.drop(['ymd'], axis=1)

    #print(df.head(500))

    # group full df by day
    daygroups = df.groupby(['Datetime'])
    logging.info("Groups: " + (str(daygroups.describe())))

    # for each group write the output file (do not write the date or the index)
    for name, group in daygroups:
        tname = str(name)
        out_fname = "Army_GKG_by_day_" + tname[:10] + ".csv"
        logging.info("Writing group to " + out_fname)
        group.to_csv(out_fname, index=False, header=False, sep="\t")
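make_date is not shown. A plausible sketch, assuming V2.1DATE holds GDELT's 14-digit YYYYMMDDHHMMSS timestamp and the goal is the YYYY-MM-DD string that pd.to_datetime parses above:

def make_date(row):
    # "20200215224500" -> "2020-02-15"
    s = str(row['V2.1DATE'])
    return s[0:4] + "-" + s[4:6] + "-" + s[6:8]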
Example #8
def main():

    ans = ""
    while (ans != 'y') and (ans != 'n'):
        ans = input("Download new GDELT Master File List? [y/n] ")
        ans = ans.lower().strip()

    master_url = "http://data.gdeltproject.org/gdeltv2/masterfilelist.txt"
    file_name = DATA_DIR + fname_from_url(master_url)

    if ans == 'y':

        logging.info("Downloading master file list.")

        # fetch first so a failed request doesn't leave an empty file behind
        response = get(master_url)

        with open(file_name, "wb") as file:
            file.write(response.content)

    else:
        logging.info("Using existing master file list.")

    build_gkg_urls(file_name)
Example #9
def main():

    # create blank full df
    df = pd.DataFrame(columns=GKG_COLUMN_NAMES)

    file_list = glob.glob(ARMY_GKG_DAILY_DIR + "Army_GKG_by_day*.csv")

    for f in file_list:

        logging.info("reading" + f)

        # read the file into temp df
        tdf = pd.read_csv(f,
                          header=0,
                          sep='\t',
                          names=GKG_COLUMN_NAMES,
                          index_col=False)

        # append temp df to full df
        df = pd.concat([df, tdf], ignore_index=True)

    logging.info("consolidated df shape: " + str(df.shape))

    # build a list of the column
    items_list = df['V2ENHANCEDPERSONS'].tolist()
    #print(f"themes list length: " + str(len(themes_list)))
    #print(themes_list[10])

    items_dict = {}
    for line in items_list:  # line has the content from the GKG cell
        l = line.strip()
        items = l.split(";")  # list of top level entities
        for i in items:
            i = i.strip()
            if len(i) > 1:
                ii = i.split(',')[0]  # subfield of entity
                ii = ii.strip()
                if len(ii) > 1:
                    if ii in items_dict:
                        items_dict[ii] += 1
                    else:
                        items_dict[ii] = 1

    # add the V2 Themes

    #print(str(themes_dict))
    items_hist_df = pd.DataFrame.from_dict(items_dict, orient='index')
    items_hist_df = items_hist_df.sort_values(by=0, ascending=False)

    #print(str(themes_hist_df.head(500)))
    outfile = ARMY_GKG_DAILY_DIR + "PersonsV2Histogram.csv"
    logging.info("writing " + outfile)
    items_hist_df.to_csv(outfile, header=False, sep="\t")
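The nested splitting mirrors the GKG field layout: V2ENHANCEDPERSONS is a semicolon-delimited list of entries of the form Name,CharOffset, so splitting on ';' yields one entry per mention and splitting on ',' keeps the name while dropping the character-offset subfield.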
Example #10
def main():

    # create blank full df
    df = pd.DataFrame(columns=GKG_COLUMN_NAMES)

    file_list = glob.glob(ARMY_GKG_DAILY_DIR + "Army_GKG_by_day*.csv")

    for f in file_list:

        logging.info("reading" + f)

        # read the file into temp df
        tdf = pd.read_csv(f,
                          header=0,
                          sep='\t',
                          names=GKG_COLUMN_NAMES,
                          index_col=False)

        # append temp df to full df
        df = pd.concat([df, tdf], ignore_index=True)

    logging.info("consolidated df shape: " + str(df.shape))

    # build a list of the V1THEMES column
    themes_list = df['V1THEMES'].tolist()
    #print(f"themes list length: " + str(len(themes_list)))
    #print(themes_list[10])

    themes_dict = {}
    for line in themes_list:
        l = line.strip()
        themes = l.split(";")
        for t in themes:
            t = t.strip()
            if len(t) > 1:
                tn = t.split(',')[0]
                if tn in themes_dict:
                    themes_dict[tn] += 1
                else:
                    themes_dict[tn] = 1

    # add the V2 Themes

    #print(str(themes_dict))
    themes_hist_df = pd.DataFrame.from_dict(themes_dict, orient='index')
    themes_hist_df = themes_hist_df.sort_values(by=0, ascending=False)

    #print(str(themes_hist_df.head(500)))
    outfile = ARMY_GKG_DAILY_DIR + "ThemesHistogram.csv"
    logging.info("writing " + outfile)
    themes_hist_df.to_csv(outfile, header=False, sep="\t")
Example #11
def build_url_queue(use_failed_files):

    if use_failed_files:
        input_file_list = FAILED_FILES_LIST
    else:
        input_file_list = GKG_FILE_LIST

    # Read the gkg file list
    gkg_list = []
    try:
        with open(input_file_list) as f:
            gkg_list = f.read().splitlines()

    except EnvironmentError:
        logging.info(input_file_list + " not read")

    # Read the processed files list if it exists
    logging.info("reading processed files list")
    processed_files = []
    try:
        with open(FILES_PROCESSED_LIST) as f:
            processed_files = f.read().splitlines()

    except EnvironmentError:
        logging.info("processed files list not read")

    logging.info("processed files length: " + str(len(processed_files)))

    # Build the queue by comparing the processed files list with the gkg list
    files_queue = []
    for url in gkg_list:
        if url not in processed_files:
            files_queue.append(url)

    # delete the failed files list
    if os.path.exists(FAILED_FILES_LIST):
        os.remove(FAILED_FILES_LIST)

    return files_queue
Example #12
def main():

    # create blank full df
    df = pd.DataFrame(columns=GKG_COLUMN_NAMES)

    file_list = glob.glob(ARMY_GKG_DAILY_DIR + "Army_GKG_by_day*.csv")

    for f in file_list:
        
        logging.info("reading" + f)

        # read the file into temp df
        tdf = pd.read_csv(f, header=0, sep='\t', names=GKG_COLUMN_NAMES, 
            index_col=False)

        # append temp df to full df
        df = pd.concat([df, tdf], ignore_index=True)


    logging.info("consolidated df shape: " + str(df.shape))

    # build a list of the V1PERSONS column
    item_list = df['V1PERSONS'].tolist()


    items_dict = {}
    for line in item_list:
        l = line.strip()
        items = l.split(";")
        for i in items:
            i = i.strip()
            if len(i) > 1:
                if i in items_dict:
                    items_dict[i] += 1
                else:
                    items_dict[i] = 1

    
    #print(str(themes_dict))
    items_hist_df = pd.DataFrame.from_dict(items_dict, orient='index')
    items_hist_df = items_hist_df.sort_values(by=0, ascending=False)

    #print(str(themes_hist_df.head(500)))
    outfile = ARMY_GKG_DAILY_DIR + "PersonsHistogram.csv"
    logging.info("writing " + outfile)
    items_hist_df.to_csv(outfile, header=False, sep="\t")
Example #13
def main():

    # create blank full df
    df = pd.DataFrame(columns=GKG_COLUMN_NAMES)

    file_list = glob.glob(ARMY_GKG_DAILY_DIR + "Army_GKG_by_day*.csv")
    #file_list = glob.glob(ARMY_GKG_DAILY_DIR + "Army_GKG_by_day_2020-02-16.csv")

    for f in file_list:

        logging.info("reading" + f)

        # read the file into temp df
        tdf = pd.read_csv(f,
                          header=0,
                          sep='\t',
                          names=GKG_COLUMN_NAMES,
                          index_col=False)

        # append temp df to full df
        df = pd.concat([df, tdf], ignore_index=True)

    logging.info("consolidated df shape: " + str(df.shape))

    # -------------------------------- DEBUG ------------------------------
    # persons_rows = []
    # persons_rows.append("n1; n2; n3; n4")
    # persons_rows.append("n1; n3; n5; n6")
    gkg_data = [
        [
            'col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8',
            'col9', 'col10', 'col11', 'n1; n2; n3; n4', 'col13', 'col14',
            'col15', 'col16', 'col17', 'col18', 'col19', 'col20', 'col21',
            'col22', 'col23', 'col24', 'col25', 'col26', 'col27'
        ],
        [
            'col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8',
            'col9', 'col10', 'col11', 'n1; n3; n5; n6', 'col13', 'col14',
            'col15', 'col16', 'col17', 'col18', 'col19', 'col20', 'col21',
            'col22', 'col23', 'col24', 'col25', 'col26', 'col27'
        ],
    ]

    # create blank full df
    #df = pd.DataFrame(gkg_data, columns=GKG_COLUMN_NAMES)

    # ------ create person nodes (TODO this can be made faster) --------
    persons_rows = df['V1PERSONS'].tolist()
    persons_dict = {}

    # assign person IDs
    pid = 0
    persons_set = set()
    for line in persons_rows:
        persons_set = get_persons_set(line)

        for person in persons_set:
            if person in persons_dict:
                values = persons_dict[person]
                node_size = values[0]
                person_id = values[1]
                persons_dict[person] = [node_size + 1, person_id]

            else:
                persons_dict[person] = [1, pid]
                pid += 1

    # convert, sort and write person nodes file
    plist = []
    for key, value in persons_dict.items():
        pl = [value[1], key, value[0]]  # id, label, nodesize
        plist.append(pl)

    node_list_df = pd.DataFrame(plist, columns=['id', 'label', 'value'])
    node_list_df = node_list_df.sort_values(by=['value'], ascending=False)
    #print(str(node_list_df.head(20)))

    nodesfile = GRAPH_DIR + "PersonsNodes.csv"
    logging.info("writing " + nodesfile)
    node_list_df.to_csv(nodesfile, header=True, index=False, sep=",")

    # -------------------- Build edge list ------------------------------------

    # make persons column into set
    df['V1PERSONS'] = df.apply(lambda row: make_persons_pairs(row.V1PERSONS),
                               axis=1)

    # pdf = df[['V1PERSONS']].copy()
    # print(pdf.head())
    logging.info('creating pairs')
    pairs = []
    persons_list = df['V1PERSONS'].tolist()
    for row in persons_list:
        for pair in row:
            pair_string = str(pair[0]) + ", " + str(pair[1])
            pairs.append(pair_string)

    # make one column with all the pairs and value counts
    logging.info('building edge df')
    pdf = pd.DataFrame()
    pdf["values"] = pairs
    vc = pdf["values"].value_counts()
    edge_df = vc.rename("values").to_frame()  # keep the "values" column name stable across pandas versions
    #print("edge df: ")
    #print(edge_df.head(20))

    #print(str(df.head()))
    #print(str(df['V1PERSONS']))

    edgefile = GRAPH_DIR + "PersonsEdgeListSorted.csv"
    logging.info("writing " + edgefile)
    edge_df.to_csv(edgefile, header=False, index=True, sep=",")

    # Write the edge list using node ids instead of names
    edge_df['id1'] = edge_df.apply(
        lambda row: names_to_ids(persons_dict, row, 0), axis=1)
    edge_df['id2'] = edge_df.apply(
        lambda row: names_to_ids(persons_dict, row, 1), axis=1)
    print(edge_df)

    edgeidfile = GRAPH_DIR + "PersonsEdgeIDs.csv"
    logging.info("writing " + edgeidfile)

    edge_df.to_csv(edgeidfile,
                   columns=['id1', 'id2', 'values'],
                   header=True,
                   index=False,
                   sep=",")
Example #14
def main():

    TITLE = "GKG Persons"
    entity_name = "persons"
    search_subtitle = "Filtered on: Organization = US Army  (5464 entries)"
    date_subtitle = "1-FEB-2020 through 16-FEB-2020"

    NUM_TABLES = 2

    HISTOGRAM_FILE1 = HISTOGRAM_DIR + "PersonsHistogram.csv"
    TABLE1_TITLE = "V1PERSONS occurrences"

    HISTOGRAM_FILE2 = HISTOGRAM_DIR + "PersonsV2Histogram.csv"
    TABLE2_TITLE = "V2ENHANCEDPERSONS occurrences"

    OUTFILE = HTML_DIR + "persons.html"
    MAX_BAR_LENGTH = 200

    # HTML Start
    html = write_header(TITLE)
    html.append(f'<h1> {TITLE}</h1>')
    html.append(f'<h2> {date_subtitle}</h2>')
    html.append(f'<h2> {search_subtitle}</h2>')
    html.append(' <div class="flexrow">')

    # TABLE 1

    # Read histogram into list
    logging.info("reading " + HISTOGRAM_FILE1)
    with open(HISTOGRAM_FILE1) as f:
        lines = f.read().splitlines()

    # get the highest score
    pair = lines[0].split("\t")
    high_score = int(pair[1].strip())

    # build histo1 dict
    hist_dict = {}
    for line in lines:
        entries = line.split("\t")
        location = entries[0].strip()
        score = int(entries[1].strip())
        hist_dict[location] = score

    html.append('  <div class="flexcol"> <table>')
    t1caption = TABLE1_TITLE + f" ({len(hist_dict)} {entity_name})"
    html.append(f'   <caption>{t1caption}</caption>')

    # one table row for each dictionary entry
    for key, value in hist_dict.items():

        barlength = (value / high_score) * MAX_BAR_LENGTH
        barlength = math.ceil(barlength)

        left = barlength + 5

        html.append('   <tr>')
        html.append('     <td>')
        html.append(f'      <div class="feature">{key}</div>')
        html.append('     </td>')
        html.append('     <td>')
        html.append(
            f'      <div class="score-bar" style="width:{barlength}px;">')
        html.append(
            f'      <p class="score" style="left: {left}px;">{value}</p></div>  '
        )
        html.append('     </td>')
        html.append('   </tr>')

    html.append('  </table> </div>')

    # TABLE 2
    # Read histogram into list
    logging.info("reading " + HISTOGRAM_FILE2)
    with open(HISTOGRAM_FILE2) as f:
        lines = f.read().splitlines()

    # get the highest score
    pair = lines[0].split("\t")
    high_score = int(pair[1].strip())

    # build histo dict
    hist_dict = {}
    for line in lines:
        entries = line.split("\t")
        location = entries[0].strip()
        score = int(entries[1].strip())
        hist_dict[location] = score

    t2caption = TABLE2_TITLE + f" ({len(hist_dict)} {entity_name})"
    html.append('  <div class="flexcol"> <table>')
    html.append(f'   <caption>{t2caption}</caption>')

    # one table row for each dictionary entry
    for key, value in hist_dict.items():

        barlength = (value / high_score) * MAX_BAR_LENGTH
        barlength = math.ceil(barlength)

        left = barlength + 5

        html.append('   <tr>')
        html.append('     <td>')
        html.append(f'      <div class="feature">{key}</div>')
        html.append('     </td>')
        html.append('     <td>')
        html.append(
            f'      <div class="score-bar" style="width:{barlength}px;">')
        html.append(
            f'      <p class="score" style="left: {left}px;">{value}</p></div>  '
        )
        html.append('     </td>')
        html.append('   </tr>')

    html.append('  </table> </div>')

    html.append(' </div>')  # flexrow

    # CLOSING HTML
    footer = write_footer()
    for line in footer:
        html.append(line)

    logging.info("writing " + OUTFILE)
    with open(OUTFILE, 'w') as f:
        for line in html:
            f.write(line + "\n")
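The two table blocks are identical apart from their input file and caption and could be factored into a single helper taking (histogram_file, caption). Reading the high score from lines[0] relies on the histogram files being sorted in descending order, which the generating scripts above guarantee via sort_values(ascending=False).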
Example #15
def main():

    # create blank full df
    df = pd.DataFrame(columns=GKG_COLUMN_NAMES)

    file_list = glob.glob(ARMY_GKG_DAILY_DIR + "Army_GKG_by_day*.csv")
    #file_list = glob.glob(ARMY_GKG_DAILY_DIR + "Army_GKG_by_day_2020-02-16.csv")

    for f in file_list:

        logging.info("reading" + f)

        # read the file into temp df
        tdf = pd.read_csv(f,
                          header=0,
                          sep='\t',
                          names=GKG_COLUMN_NAMES,
                          index_col=False)

        # append temp df to full df
        df = pd.concat([df, tdf], ignore_index=True)

    logging.info("consolidated df shape: " + str(df.shape))

    #-------------------------------- DEBUG ------------------------------
    # debug values (overwritten below, so kept commented out as in Example #13)
    # persons_rows = []
    # persons_rows.append("n1; n2; n3; n4")
    # persons_rows.append("n1; n3; n5; n6")
    gkg_data = [
        [
            'col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8',
            'col9', 'col10', 'col11', 'n1; n2; n3; n4', 'col13', 'col14',
            'col15', 'col16', 'col17', 'col18', 'col19', 'col20', 'col21',
            'col22', 'col23', 'col24', 'col25', 'col26', 'col27'
        ],
        [
            'col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8',
            'col9', 'col10', 'col11', 'n1; n3; n5; n6', 'col13', 'col14',
            'col15', 'col16', 'col17', 'col18', 'col19', 'col20', 'col21',
            'col22', 'col23', 'col24', 'col25', 'col26', 'col27'
        ],
        [
            'col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8',
            'col9', 'col10', 'col11', 'n6; n7', 'col13', 'col14', 'col15',
            'col16', 'col17', 'col18', 'col19', 'col20', 'col21', 'col22',
            'col23', 'col24', 'col25', 'col26', 'col27'
        ],
        [
            'col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8',
            'col9', 'col10', 'col11', 'n7; n8', 'col13', 'col14', 'col15',
            'col16', 'col17', 'col18', 'col19', 'col20', 'col21', 'col22',
            'col23', 'col24', 'col25', 'col26', 'col27'
        ],
        [
            'col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8',
            'col9', 'col10', 'col11', 'n8; n9', 'col13', 'col14', 'col15',
            'col16', 'col17', 'col18', 'col19', 'col20', 'col21', 'col22',
            'col23', 'col24', 'col25', 'col26', 'col27'
        ],
        [
            'col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8',
            'col9', 'col10', 'col11', 'n3; n10', 'col13', 'col14', 'col15',
            'col16', 'col17', 'col18', 'col19', 'col20', 'col21', 'col22',
            'col23', 'col24', 'col25', 'col26', 'col27'
        ],
    ]

    # create blank full df
    #df = pd.DataFrame(gkg_data, columns=GKG_COLUMN_NAMES)

    # ------ create person nodes (TODO this can be made faster) --------
    persons_rows = df['V1PERSONS'].tolist()
    persons_dict = {}

    # assign person IDs
    pid = 0
    persons_set = set()
    for line in persons_rows:
        persons_set = get_persons_set(line)

        for person in persons_set:
            if person in persons_dict:
                values = persons_dict[person]
                node_size = values[0]
                person_id = values[1]
                persons_dict[person] = [node_size + 1, person_id]

            else:
                persons_dict[person] = [1, pid]
                pid += 1

    # convert, sort and write person nodes file
    plist = []
    for key, value in persons_dict.items():
        pl = [value[1], key, value[0]]  # id, label, nodesize
        plist.append(pl)

    node_list_df = pd.DataFrame(plist, columns=['id', 'label', 'value'])
    node_list_df = node_list_df.sort_values(by=['value'], ascending=False)
    #print(str(node_list_df.head(20)))

    nodesfile = GRAPH_DIR + "PersonsNodes.csv"
    logging.info("writing " + nodesfile)
    node_list_df.to_csv(nodesfile, header=True, index=False, sep=",")

    # -------------------- Build edge list ------------------------------------

    # make persons column into set
    df['V1PERSONS'] = df.apply(lambda row: make_persons_pairs(row.V1PERSONS),
                               axis=1)

    # pdf = df[['V1PERSONS']].copy()
    # print(pdf.head())
    logging.info('creating pairs')
    pairs = []
    persons_list = df['V1PERSONS'].tolist()
    for row in persons_list:
        for pair in row:
            pair_string = str(pair[0]) + ", " + str(pair[1])
            pairs.append(pair_string)

    # make one column with all the pairs and value counts
    logging.info('building edge df')
    pdf = pd.DataFrame()
    pdf["values"] = pairs
    vc = pdf["values"].value_counts()
    edge_df = vc.rename("values").to_frame()  # keep the "values" column name stable across pandas versions
    #print("edge df: ")
    #print(edge_df.head(20))

    #print(str(df.head()))
    #print(str(df['V1PERSONS']))

    edgefile = GRAPH_DIR + "PersonsEdgeListSorted.csv"
    logging.info("writing " + edgefile)
    edge_df = edge_df.sort_values(by=['values'], ascending=False)
    edge_df.to_csv(edgefile, header=False, index=True, sep=",")

    # Add node ids as columns
    edge_df['id1'] = edge_df.apply(
        lambda row: names_to_ids(persons_dict, row, 0), axis=1)
    edge_df['id2'] = edge_df.apply(
        lambda row: names_to_ids(persons_dict, row, 1), axis=1)
    #print(edge_df)

    edgeidfile = GRAPH_DIR + "PersonsEdgeIDs.csv"
    logging.info("writing " + edgeidfile)

    # write ids edge list
    edge_df = edge_df.sort_values(by=['values'], ascending=False)
    edge_df.to_csv(edgeidfile,
                   columns=['id1', 'id2', 'values'],
                   header=True,
                   index=False,
                   sep=",")

    # ----------- Build graphs for top-N node values (number of appearances in doc set)
    logging.info("Building Top-N Lists")
    TOP_N = 10
    LINK_VALUE_CUTOFF = 50
    shortlist_df = node_list_df[:TOP_N]

    # keep edge df rows where one of the ids is in the short list

    short_ids = shortlist_df['id'].apply(str).tolist()
    #print(str(short_ids))

    edges_short = edge_df.loc[(edge_df['id1'].isin(short_ids)) |
                              (edge_df['id2'].isin(short_ids))]
    edges_short = edges_short[edges_short['values'] > LINK_VALUE_CUTOFF]

    # write the shortlist
    nodesfile = GRAPH_DIR + "PersonsEdgeListTopN.csv"
    logging.info("writing " + nodesfile)
    edges_short = edges_short.sort_values(by=['values'], ascending=False)
    edges_short.to_csv(nodesfile,
                       header=True,
                       index=False,
                       sep=",",
                       columns=['id1', 'id2', 'values'])

    # add the connected nodes to the persons shortlist
    # collect new ids
    logging.info("adding adjacent nodes to top N")
    has_new_node = edges_short.loc[~edges_short['id1'].isin(short_ids)
                                   | ~edges_short['id2'].isin(short_ids)]

    # keep high-link-strength edges only
    has_new_node = has_new_node[has_new_node['values'] > LINK_VALUE_CUTOFF]

    #print(has_new_node)

    ids_set = set(has_new_node['id1'].tolist() + has_new_node['id2'].tolist())

    #print(ids_set)

    #print(node_list_df)
    logging.info("building adjacent nodes df")
    plist = []
    idx = 0
    for nodeid in ids_set:
        idx += 1
        # reuse short_ids from above instead of rebuilding the list every pass
        if nodeid not in short_ids:
            label = node_list_df[node_list_df['id'].apply(str) ==
                                 nodeid]['label'].item()
            value = node_list_df[node_list_df['id'].apply(str) ==
                                 nodeid]['value'].item()
            pl = [nodeid, label, value]
            plist.append(pl)

        if (idx % 100) == 0:
            print("processed rows: " + str(idx))

    adjacent_nodes = pd.DataFrame(plist, columns=['id', 'label', 'value'])
    #print(adjacent_nodes)

    shortlist_df = pd.concat([shortlist_df, adjacent_nodes])

    # write the short nodes list to file
    nodesfile = GRAPH_DIR + "PersonsNodesTopN.csv"
    logging.info("writing " + nodesfile)
    shortlist_df = shortlist_df.sort_values(by=['value'], ascending=False)
    shortlist_df.to_csv(nodesfile,
                        header=True,
                        index=False,
                        sep=",",
                        columns=['id', 'label', 'value'])
Example #16
def main():

    # create blank full df
    df = pd.DataFrame(columns=GKG_COLUMN_NAMES)

    file_list = glob.glob(ARMY_GKG_DAILY_DIR + "Army_GKG_by_day*.csv")

    for f in file_list:

        logging.info("reading" + f)

        # read the file into temp df
        tdf = pd.read_csv(f,
                          header=0,
                          sep='\t',
                          names=GKG_COLUMN_NAMES,
                          index_col=False)

        # append temp df to full df
        df = pd.concat([df, tdf], ignore_index=True)

    logging.info("consolidated df shape: " + str(df.shape))

    # build a list of the column
    items_list = df['V2EXTRASXML'].tolist()
    #print(f"themes list length: " + str(len(themes_list)))
    #print(themes_list[10])

    # stop_words
    with open(STOPWORDS_FILE) as f:
        stopwords = f.read().splitlines()

    # note: punctdict is unused; punctuation is stripped via str.maketrans below
    punctdict = {
        ".": "",
        ",": "",
        "?": "",
        "#": "",
        "$": "",
        "!": "",
        "&": "",
        "*": "",
        "(": "",
        ")": "",
        '"': "",
        ":": "",
        ";": ""
    }

    items_dict = {}

    tag = "<PAGE_TITLE>"
    endtag = "</PAGE_TITLE>"
    table = str.maketrans(dict.fromkeys(
        string.punctuation))  # for punctuation removal
    for line in items_list:  # line has the content from the GKG cell
        l = line.strip()

        if tag in l:
            # slice the stripped string l (not the raw line) and use len(tag)
            # instead of the magic number 12 so the offsets stay aligned
            title = l[l.find(tag) + len(tag):l.find(endtag)]
            #print(title)
            words = title.split(" ")
            for w in words:
                w = w.strip()
                w = w.translate(table)  # remove punctuation
                if (len(w) > 1) and (w.lower() not in stopwords):
                    if w in items_dict:
                        items_dict[w] += 1
                    else:
                        items_dict[w] = 1

    #print(str(themes_dict))
    items_hist_df = pd.DataFrame.from_dict(items_dict, orient='index')
    items_hist_df = items_hist_df.sort_values(by=0, ascending=False)

    #print(str(themes_hist_df.head(500)))
    outfile = ARMY_GKG_DAILY_DIR + "TitleWordsHistogram.csv"
    logging.info("writing " + outfile)
    items_hist_df.to_csv(outfile, header=False, sep="\t")
Example #17
def main():

    USE_FAILED_FILES_LIST = False

    startTime = pd.Timestamp('now')
    logging.info("ANTS run started at " + str(startTime))

    # elapsed time working on the current output file
    fileTime = pd.Timestamp('now')

    # add time to output file to prevent overwrites
    timestr = time.strftime("%Y%m%d-%H%M%S")
    outfile = OUTPUT_FILE_PRE + timestr + ".csv"

    # Build the input queue
    files_queue = build_url_queue(USE_FAILED_FILES_LIST)

    if len(files_queue) > 0:
        logging.info(str(len(files_queue)) + " files in queue")
    else:
        logging.info("NO UNPROCESSED FILES IN QUEUE")

    files_processed = 0
    army_gkg_events = 0
    skipped_files = 0
    for zip_file_url in files_queue:

        # Download a file from the queue
        try:
            download(zip_file_url)

            fdf = process_zip_file(zip_file_url)

            army_gkg_events += fdf.shape[0]

            # create a new file after an hour
            elapsed_time = pd.Timestamp('now') - fileTime
            if elapsed_time.seconds > 3600:
                timestr = time.strftime("%Y%m%d-%H%M%S")
                outfile = OUTPUT_FILE_PRE + timestr + ".csv"
                fileTime = pd.Timestamp('now')

            fdf.to_csv(outfile,
                       mode='a',
                       header=False,
                       sep='\t',
                       na_rep=' ',
                       index=False)
            logging.info("Wrote results to " + outfile)

            files_processed += 1
            logging.info(f'Completed {files_processed} files')
            logging.info("ARMY GKG EVENTS SO FAR: " + str(army_gkg_events))

        except Exception as e:

            logging.error("Problem processing " + zip_file_url)

            skipped_files += 1

            with open(FAILED_FILES_LIST, "a") as failedfile:
                failedfile.write(zip_file_url + "\n")

            # delete the zip file
            badfile = fname_from_url(zip_file_url)
            if os.path.exists(badfile):
                os.remove(badfile)

    endTime = pd.Timestamp('now')
    logging.info("ANTS run finished at " + str(endTime))
    logging.info("Elapsed time: " + str(endTime - startTime))
    logging.info(f"Processed {files_processed} files.")
    logging.info(f"Skipped {skipped_files} files.")
    logging.info(f"Found {army_gkg_events} relevant gkg events")
Example #18
def main():

    # create blank full df
    df = pd.DataFrame(columns=GKG_COLUMN_NAMES)

    #file_list = glob.glob(ARMY_GKG_DAILY_DIR + "Army_GKG_by_day*.csv")
    file_list = glob.glob(ARMY_GKG_DAILY_DIR + "Army_GKG_by_day_2020-02-16.csv")

    for f in file_list:
        
        logging.info("reading" + f)

        # read the file into temp df
        tdf = pd.read_csv(f, header=0, sep='\t', names=GKG_COLUMN_NAMES, 
            index_col=False)

        # append temp df to full df
        df = pd.concat([df, tdf], ignore_index=True)


    logging.info("consolidated df shape: " + str(df.shape))

    # build a list of the V1PERSONS column
    persons_rows = df['V1PERSONS'].tolist()


    # DEBUG
    # persons_rows = []
    # persons_rows.append("n1; n2; n3; n4")
    # persons_rows.append("n1; n3; n5; n6")
    # persons_rows.append("n1; n3;")
    
    

    persons_dict = {}

    # assign person IDs
    pid = 0
    persons_set = set()
    for line in persons_rows:
        persons_set = get_persons_set(line)

        for person in persons_set:
            if person in persons_dict:
                values = persons_dict[person]
                node_size = values[0]
                person_id = values[1]
                persons_dict[person] = [node_size + 1, person_id]

            else:
                persons_dict[person] = [1, pid]
                pid += 1


    # convert, sort and write person nodes file
    plist = []
    for key, value in persons_dict.items():   
        pl = [value[1], key, value[0]]
        plist.append(pl)

    node_list_df = pd.DataFrame(plist, columns=['id', 'label', 'value'])
    node_list_df = node_list_df.sort_values(by=['value'], ascending=False)
    print(str(node_list_df.head(20)))   

    nodesfile = GRAPH_DIR + "PersonsNodes.csv"
    logging.info("writing " + nodesfile)
    node_list_df.to_csv(nodesfile, header=True, index=False, sep=";")    



    # build edge list
    logging.info("building edge list")
    big_edge_list = []
    rows_processed = 0
    for line in persons_rows:
        doc_edge_list = make_doc_edge_list(persons_dict, line)
        #write_edge_list(doc_edge_list)
        # SLOW?
        big_edge_list = update_edge_list(doc_edge_list, big_edge_list)

        rows_processed += 1
        if (rows_processed % 10) == 0:
            logging.info("rows processed: " + str(rows_processed))
        


    edge_list_df = pd.DataFrame(big_edge_list, columns=['from', 'to', 'strength'])
    edge_list_df = edge_list_df.sort_values(by=['strength'], ascending=False)
    print(str(edge_list_df.head(20)))

    #print(str(themes_hist_df.head(500)))
    edgefile = GRAPH_DIR + "PersonsEdgeListSorted.csv"
    logging.info("writing " + edgefile)
    edge_list_df.to_csv(edgefile, header=True, index=False, sep=";")
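make_doc_edge_list and update_edge_list are not shown. Minimal sketches consistent with the [from, to, strength] rows built above (reusing the get_persons_set sketch from Example #13); the linear scan in update_edge_list is the "SLOW?" step flagged in the comment:

from itertools import combinations

def make_doc_edge_list(persons_dict, line):
    # one [from_id, to_id, 1] edge per unordered pair of persons in the document
    ids = sorted(persons_dict[p][1] for p in get_persons_set(line))
    return [[a, b, 1] for a, b in combinations(ids, 2)]

def update_edge_list(doc_edge_list, big_edge_list):
    # merge the document's edges into the running list, summing strengths
    for edge in doc_edge_list:
        for existing in big_edge_list:
            if existing[0] == edge[0] and existing[1] == edge[1]:
                existing[2] += edge[2]
                break
        else:  # no existing edge matched
            big_edge_list.append(edge)
    return big_edge_list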
Example #19
def main():

    # create blank full df
    df = pd.DataFrame(columns=GKG_COLUMN_NAMES)

    file_list = glob.glob(ARMY_GKG_DAILY_DIR + "Army_GKG_by_day*.csv")

    for f in file_list:
        
        logging.info("reading" + f)

        # read the file into temp df
        tdf = pd.read_csv(f, header=0, sep='\t', names=GKG_COLUMN_NAMES, 
            index_col=False)

        # append temp df to full df
        df = pd.concat([df, tdf], ignore_index=True)


    logging.info("consolidated df shape: " + str(df.shape))

    # build a list of the column
    items_list = df['V2GCAM'].tolist()
    
    # Read CodeBook into dictionary
    with open(GCAM_CODEBOOK, errors="backslashreplace") as f:
        gcam_lines = f.read().splitlines()   

    # Build gcam dictionary
    logging.info("building GCAM dictionary")
    gcam_dict = {}
    for line in gcam_lines[1:]:  # skip the header line
        line = line.strip()
        cols = line.split("\t")
        gcam_dict[cols[0]] = cols[6]  # human-readable dimension name

    items_dict = {}  # This will hold the pairs => gcam_code : total_score

    logging.info("building histogram")
    for line in items_list:   # line has the content from the GKG cell
        l = line.strip()

        entries = l.split(",")
        for entry in entries:
            entry = entry.strip()
            if ":" not in entry:
                continue  # guard against empty or malformed entries
            code, score = entry.split(":", 1)

            if code.startswith("v"):  # "v" codes carry float dimension scores
                if code in items_dict:
                    items_dict[code] += float(score)
                else:
                    items_dict[code] = float(score)

    # annotate the gcam codes with their human dimension names
    labeled_items_dict = {}
    logging.info("adding dimension labels")
    for key, value in items_dict.items():
        label = gcam_dict.get(key, "")  # tolerate codes missing from the codebook
        labeled_items_dict[key + " " + label] = value
    
    
    #print(str(themes_dict))
    logging.info("building dataframe")
    items_hist_df = pd.DataFrame.from_dict(labeled_items_dict, orient='index')
    items_hist_df = items_hist_df.sort_values(by=0, ascending=False)

    #print(str(themes_hist_df.head(500)))
    outfile = ARMY_GKG_DAILY_DIR + "GCAM_Values_Histogram.csv"
    logging.info("writing " + outfile)
    items_hist_df.to_csv(outfile, header=False, sep="\t")
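For reference: a V2GCAM cell is a comma-delimited list of code:score entries. Codes beginning with "v" carry floating-point dimension scores (the rest are integer word counts, which this histogram ignores), and the seventh codebook column supplies the human-readable dimension name used for the labels.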
Example #20
def main():


    TITLE = "GKG Persons Graph (Top 19 Entities)"
    OUTFILE = GRAPH_DIR + "GKG-graph.html"
    search_subtitle = "Filtered on: Organization = US Army  (5464 entries)"
    date_subtitle = "1-FEB-2020 through 16-FEB-2020"

    # load nodes and edges
    node_input_file = GRAPH_DIR + "PersonsNodesTopN.csv"
    edge_input_file = GRAPH_DIR + "PersonsEdgeListTopN.csv"

    # nodes
    with open(node_input_file) as file:
        lines = list(file)

    #  get node scale
    max_node_size = 0
    for line in lines[1:]:
        line = line.strip()
        items = line.split(',')
        node_val_float = float(items[2].strip())
        if node_val_float > max_node_size:
            max_node_size = node_val_float

    print("max node size: " + str(max_node_size))


    nodes = []
    NODE_SCALE = 50
    for line in lines[1:]:
        line = line.strip()
        items = line.split(',')
        node_id = items[0].strip()
        node_value = items[2].strip()
        node_value = round((float(node_value) / max_node_size) * NODE_SCALE)
        node_label = items[1].strip()
        row = ("{id: " + node_id + ", value: " +
            str(node_value) + ", label: " + "'" + node_label + "'}")
        nodes.append(row)

    #print(str(nodes))

    # edges
    with open(edge_input_file) as file:
        lines = list(file)


    #  get edge scale
    max_edge_size = 0
    for line in lines[1:]:
        line = line.strip()
        items = line.split(',')
        edge_val_float = float(items[2].strip())
        if edge_val_float > max_edge_size:
            max_edge_size = edge_val_float

    print("max edge size: " + str(max_edge_size))

    
    edges = []
    EDGE_SCALE = 10
    for line in lines[1:]:
        line = line.strip()
        items = line.split(',')
        node_from = items[0].strip()
        node_to = items[1].strip()
        edge_value = items[2].strip()
        edge_value = round((float(edge_value) / max_edge_size) * EDGE_SCALE)
        row = ("{from: " + node_from + ", to: " +
            node_to + ", value: " + str(edge_value) + ", title: " + 
            "'" +  str(edge_value) + "'}")
        edges.append(row)
   
    #print(str(edges))



    # HTML Start
    html = write_header(TITLE, nodes, edges)
    html.append('<body onload="draw()">')

    html.append(f'<h1> {TITLE}</h1>')
    html.append(f'<h2> {date_subtitle}</h2>')
    html.append(f'<h2> {search_subtitle}</h2>')
    html.append('<div id="mynetwork"></div>')


    # CLOSING HTML
    footer = write_footer()
    for line in footer:
        html.append(line)

    logging.info("writing " + OUTFILE)
    with open(OUTFILE, 'w') as f:
        for line in html:
            f.write(line + "\n")
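This write_header takes the node and edge strings as well as the title, so it is a different helper from the one in Example #14: the "{id: ..., value: ..., label: ...}" and "{from: ..., to: ..., value: ...}" rows match the vis.js network format, and are presumably spliced into the page's draw() script, which the body onload handler runs.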