def df_ctr(Country):
    '''
    Select the records of one country (or of every country) and drop the
    event id and coordinate columns.

    Parameter
        - Country: name of a country or "The Whole World"   | str
    Return
        a DataFrame with selected features, i.e. every column of
        data.load_df() except 'eventid', 'latitude' and 'longitude'
    '''
    # Load once and reuse; the data set was previously loaded twice
    # whenever a single country was requested.
    df = data.load_df()
    if Country != 'The Whole World':
        df = df[df.country == Country]
    # `columns=` replaces the positional axis argument (`drop(labels, 1)`),
    # which pandas 2.0 no longer accepts.
    return df.drop(columns=['eventid', 'latitude', 'longitude'])
def df_occur_by_ctr(ctr_name):
    '''
    Count the attacks per year for one country or for the whole world.

    Parameter
         - ctr_name: country name or "The Whole World"      | str
    Return
         - number of attacks in the chosen country
           across the whole time series
           (non-attack years not included)                  | DataFrame
           with the columns 'year' and 'occurrences'
    Raises
         - KeyError if ctr_name is not a label of the country index
    '''
    df = data.load_df()
    if ctr_name == 'The Whole World':
        df_all_c = df
    else:
        # Selecting with a one-element list always yields a DataFrame, even
        # for countries with a single recorded attack. A scalar label would
        # return a Series there, which had to be transposed back by hand and
        # lost the column dtypes. `.loc` also replaces the `.ix` indexer that
        # was removed in pandas 1.0.
        # NOTE(review): assumes set_as_index returns `df` indexed by the
        # 'country' column -- confirm against its definition.
        df_all_c = set_as_index(df, 'country').loc[[ctr_name]]
    # Non-null 'eventid' values per year are the number of attacks.
    df_yr = pd.DataFrame(df_all_c.groupby('year').count().eventid)
    # rename the counted column to 'occurrences' to avoid potential misuse
    df_yr.columns = ['occurrences']
    return df_yr.reset_index()
Esempio n. 3
0
def main():
    """Generate a tweet with a Markov chain seeded from the command line.

    Expects ``sys.argv[1:]`` to be ``<lookback> <word1> [<word2> ...]``.
    The chain is built with ``MarkovChain(lookback=<lookback>)``, trained on
    the 'text' column of the cleaned tweet CSV, and asked to generate a tweet
    from the given initial words. The raw tweet and its
    ``preprocess_sentence`` form are printed.

    When fewer than two arguments are given, or the lookback value is not an
    integer, a usage message is printed and the function returns without
    training anything.
    """
    tweetfile = "data/tweets/clean/clean.csv"
    usage_lines = (
        "LOL. Please input in the format <loopback value> <word1> <word2> ...",
        "Example: tweetgen2.py 2 my life",
    )

    args = sys.argv[1:]
    if len(args) < 2:
        for line in usage_lines:
            print(line)
        return

    # Validate before the comparatively expensive load/train step so that a
    # typo in the first argument yields the usage text, not a traceback.
    try:
        n = int(args[0])
    except ValueError:
        print("Invalid lookback value {!r}: expected an integer.".format(
            args[0]))
        for line in usage_lines:
            print(line)
        return
    initial_words = args[1:]

    mc = MarkovChain(lookback=n)
    mc.train(load_df(tweetfile)['text'].values.tolist())

    tweet = mc.generate(initial_words)
    print("Generated tweet::\n{}".format(tweet))
    print('-' * 30)
    print("After preprocessing <SENTENCE>::\n{}".format(
        preprocess_sentence(tweet)))
def gtd_country_names():
    '''
    Build the list of selectable country names.

    Return
        - 'The Whole World' as the first entry, followed by
          every distinct value of the country column of
          data.load_df() in sorted order                     | list
    '''
    distinct_countries = data.load_df().country.unique()
    return ['The Whole World'] + sorted(distinct_countries)
def df_sel_btw_years(year_interval):
    '''
    Keep only the records whose year lies inside a closed interval.

    Parameter
        - year_interval: (first year, last year),
                         both ends inclusive      |   tuple
    Return
        - all rows of data.load_df() whose year
          falls in the chosen interval            |   DataFrame
    '''
    first_year = year_interval[0]
    last_year = year_interval[1]
    records = data.load_df()
    # Both bounds are inclusive.
    in_interval = (records.year >= first_year) & (records.year <= last_year)
    return records[in_interval]
Esempio n. 6
0
def main():
    """Generate a tweet from a trie built over the cleaned tweet corpus.

    Loads the tweet CSV with ``load_df``, joins its 'text' column into one
    newline-separated string, feeds it through ``create_pairs`` and
    ``build_trie``, and calls ``generate1`` with initial word 'i' and
    ``max_len=15``. The generated words are joined with spaces; the raw
    text and its ``preprocess_sentence`` form are printed.
    """
    tweetfile = "data/tweets/clean/clean.csv"

    # Corpus: every tweet text on its own line, outer whitespace trimmed.
    tweet_texts = load_df(tweetfile)['text'].values.tolist()
    corpus = "\n".join(tweet_texts).strip()

    trie = build_trie(create_pairs(corpus))
    words = generate1(trie, initial_word='i', max_len=15, verbose=False)
    generated_text = ' '.join(words)

    separator = '-' * 30
    print("Generated tweet::\n{}".format(generated_text))
    print(separator)
    cleaned = preprocess_sentence(generated_text)
    print("After preprocessing <SENTENCE>::\n{}".format(cleaned))
def plot_2D_density(Year, MapStyle):
    '''
    Plot every attack in a year interval as a translucent red marker on a
    world map and show the figure.

    Parameters
        - Year      : (start, end) interval within 1970-2015;
                      start == end selects a single year     | tuple
        - MapStyle  : 'Blue Marble', 'Etopo', or any other
                      value for the plain light-blue style   | str
    Return
        None. A 2D Geo Map is shown: The denser the red marker in a country,
        the more attacks were recorded there. For the interval (1993, 1993)
        only a notice is printed, because that year is missing from the data.
    Raises
        - NotIntervalError     : str(Year) does not look like
                                 "(YYYY, YYYY)" or "[YYYY, YYYY]"
        - IntervalReverseError : the starting year is after the ending year
        - IntervalLeakError    : the interval leaves the range 1970-2015
    '''
    # use regular expression to check the format
    if not re.match(r'[\[|\(][0-9]{4}\,\s?[0-9]{4}[\]|\)]$', str(Year)):
        raise NotIntervalError

    # The starting year must not be after the ending year. Equal years are
    # allowed: they select a single year. The former `>=` comparison rejected
    # them and made the two single-year branches below unreachable.
    if Year[0] > Year[1]:
        raise IntervalReverseError

    # catch the missing value exceptions in 1993
    if Year[0] == Year[1] == 1993:
        print('Data of 1993 is not available in Global Terrorism Database.\n\
Click the link to learn why.\nhttps://www.start.umd.edu/gtd/faq/')
        return

    # catch the out of range year interval input errors
    if (Year[0] < 1970) or (Year[1] > 2015):
        raise IntervalLeakError

    if Year[0] == Year[1]:
        # single year: filter the full data set directly
        df_gt = data.load_df()
        df = df_gt[df_gt.year == Year[0]]
    else:
        df = ut.df_sel_btw_years(Year)

    plt.figure(figsize=(18, 10), frameon=False)

    # The projection has to be passed by keyword: the first positional
    # parameter of Basemap is `llcrnrlon`, so Basemap('mill') did not select
    # the Miller projection.
    m = Basemap(projection='mill')
    m.drawcountries(linewidth=0.5,
                    linestyle='solid',
                    color='white',
                    antialiased=1,
                    ax=None,
                    zorder=None)

    # Background settings
    if MapStyle == 'Blue Marble':
        m.drawcoastlines()
        m.bluemarble()
    elif MapStyle == 'Etopo':
        m.etopo()
    else:
        m.drawcoastlines(color='w')
        m.drawcountries(color='w')
        m.drawstates(color='w')
        m.fillcontinents(color='lightblue', lake_color='w')
        m.drawmapboundary(fill_color='w', color='w')

    # get latitude and longitude
    lat = ut.make_array(df, 'latitude')
    lon = ut.make_array(df, 'longitude')

    # project to map coordinates
    x, y = m(lon, lat)
    # 'ro' = red circles. The marker used to be given twice ('r^' in the
    # format string plus marker='o'); the keyword won, so circles were drawn.
    m.plot(x, y, 'ro', markersize=4, alpha=.3)

    plt.title('Global Attack Density Plot: {}-{}'.format(Year[0], Year[1]),
              size=16)
    plt.show()