def df_ctr(Country):
    '''
    Select the records of one country (or of every country) from the dataset
    returned by data.load_df().

    Parameter
    - Country: name of a country or "The Whole World" | str
    Return
    - the matching rows without the 'eventid', 'latitude' and 'longitude'
      columns | DataFrame
    '''
    # Load once and reuse; the per-country branch used to load the dataset
    # twice (once for the frame, once more for the mask).
    df = data.load_df()
    if Country != 'The Whole World':
        df = df[df.country == Country]
    # The axis must be given by keyword: the positional form
    # `drop(labels, 1)` is rejected by pandas 2.0 and later.
    return df.drop(['eventid', 'latitude', 'longitude'], axis=1)
def df_occur_by_ctr(ctr_name):
    '''
    Count the attacks per year for one country or for the whole dataset.

    Parameter
    - ctr_name: country name or "The Whole World" | str
    Return
    - one row per year that has at least one record, with the columns
      'year' and 'occurrences' (years without attacks are not included)
      | DataFrame
    '''
    df = data.load_df()
    if ctr_name == 'The Whole World':
        df_all_c = df
    else:
        # `.loc` replaces the `.ix` indexer, which was removed in pandas 1.0.
        # NOTE(review): assumes set_as_index returns `df` indexed by the
        # 'country' column - confirm against its definition.
        df_all_c = set_as_index(df, 'country').loc[ctr_name]
        # A country with a single recorded attack is returned as a Series
        # (one row) rather than a DataFrame; transpose it back into a
        # one-row DataFrame so the groupby below works.
        if isinstance(df_all_c, pd.Series):
            df_all_c = pd.DataFrame(df_all_c).T
    df_yr = pd.DataFrame(df_all_c.groupby('year').count().eventid)
    # Rename the counted 'eventid' column to 'occurrences' so it cannot be
    # mistaken for an identifier downstream.
    df_yr.columns = ['occurrences']
    return df_yr.reset_index()
def main():
    '''
    Command-line entry point: generate a tweet with a Markov chain.

    Expected arguments (read from sys.argv): an integer lookback value
    followed by one or more seed words. The chain is trained on the 'text'
    column of the cleaned tweet file, then the generated tweet is printed
    both raw and after preprocess_sentence().

    If fewer than two arguments are given, or the lookback value is not an
    integer, a usage message is printed and nothing is generated.
    '''
    tweetfile = "data/tweets/clean/clean.csv"

    def print_usage():
        print(
            "LOL. Please input in the format <loopback value> <word1> <word2> ..."
        )
        print("Example: tweetgen2.py 2 my life")

    args = sys.argv[1:]
    if len(args) < 2:
        print_usage()
        return

    # A non-numeric lookback used to escape as an uncaught ValueError;
    # treat it like any other malformed invocation.
    try:
        n = int(args[0])
    except ValueError:
        print_usage()
        return
    initial_words = args[1:]

    mc = MarkovChain(lookback=n)
    mc.train(load_df(tweetfile)['text'].values.tolist())
    tweet = mc.generate(initial_words)

    print("Generated tweet::\n{}".format(tweet))
    print('-' * 30)
    print("After preprocessing <SENTENCE>::\n{}".format(
        preprocess_sentence(tweet)))
def gtd_country_names():
    '''
    Build the list of selectable region names.

    Return
    - 'The Whole World' followed by every distinct value of the dataset's
      country column in sorted order | list
    '''
    countries = data.load_df().country.unique()
    return ['The Whole World'] + sorted(countries)
def df_sel_btw_years(year_interval):
    '''
    Restrict the dataset to a range of years, both ends included.

    Parameter
    - year_interval: (first year, last year) | tuple
    Return
    - every row whose 'year' lies inside the interval | DataFrame
    '''
    gt_df = data.load_df()
    first_year, last_year = year_interval[0], year_interval[1]
    in_interval = (gt_df.year >= first_year) & (gt_df.year <= last_year)
    return gt_df[in_interval]
def main():
    '''
    Generate a tweet from the cleaned tweet corpus and print it.

    The 'text' column of the tweet file is joined into one newline-separated
    string, passed through create_pairs() and build_trie(), and generate1()
    is asked for up to 15 words starting from 'i'. The result is printed
    both as generated and after preprocess_sentence().
    '''
    frame = load_df("data/tweets/clean/clean.csv")
    corpus = "\n".join(frame['text'].values.tolist()).strip()
    trie = build_trie(create_pairs(corpus))

    words = generate1(trie, initial_word='i', max_len=15, verbose=False)
    sentence = ' '.join(words)

    print("Generated tweet::\n{}".format(sentence))
    print('-' * 30)
    cleaned = preprocess_sentence(sentence)
    print("After preprocessing <SENTENCE>::\n{}".format(cleaned))
def plot_2D_density(Year, MapStyle):
    '''
    Plot the location of every record in a year interval on a world map.

    Parameters
    - Year     : (start year, end year), within 1970-2015; both years may
                 be equal to select a single year | tuple
    - MapStyle : 'Blue Marble', 'Etopo', or any other value for the plain
                 light-blue style | str
    Return
    - None. Shows a 2D geo map with one translucent red marker per record:
      the denser the markers in a country, the more records it has in the
      interval. For (1993, 1993) only a notice is printed, because the
      database has no data for that year.
    Raise
    - NotIntervalError     : Year is not written as a pair of 4-digit years
    - IntervalReverseError : the start year is after the end year
    - IntervalLeakError    : the interval reaches outside 1970-2015
    '''
    # use a regular expression on the printed form to check the format
    if not re.match(r'[\[|\(][0-9]{4}\,\s?[0-9]{4}[\]|\)]$', str(Year)):
        raise NotIntervalError
    # The comparison must be strict: with `>=` an interval of two equal
    # years was rejected here, which made both the 1993 notice and the
    # single-year branch below unreachable.
    if Year[0] > Year[1]:
        raise IntervalReverseError
    # 1993 is missing from the database; explain instead of plotting an
    # empty map. Compared element-wise so a list interval is caught too.
    if Year[0] == 1993 and Year[1] == 1993:
        print('Data of 1993 is not available in Global Terrorism Database.\n'
              'Click the link to learn why.\n'
              'https://www.start.umd.edu/gtd/faq/')
        return
    # reject intervals that leave the range covered by the database
    if (Year[0] < 1970) or (Year[1] > 2015):
        raise IntervalLeakError

    if Year[0] == Year[1]:
        # start and end converge: select that single year directly
        df_gt = data.load_df()
        df = df_gt[df_gt.year == Year[0]]
    else:
        df = ut.df_sel_btw_years(Year)

    plt.figure(figsize=(18, 10), frameon=False)
    # The projection has to be passed by keyword: the first positional
    # parameter of Basemap is `llcrnrlon`, so Basemap('mill') did not
    # select the Miller projection.
    m = Basemap(projection='mill')
    m.drawcountries(linewidth=0.5,
                    linestyle='solid',
                    color='white',
                    antialiased=1,
                    ax=None,
                    zorder=None)

    # Background settings
    if MapStyle == 'Blue Marble':
        m.drawcoastlines()
        m.bluemarble()
    elif MapStyle == 'Etopo':
        m.etopo()
    else:
        m.drawcoastlines(color='w')
        m.drawcountries(color='w')
        m.drawstates(color='w')
        m.fillcontinents(color='lightblue', lake_color='w')
        m.drawmapboundary(fill_color='w', color='w')

    # get latitude and longitude, then project them to map coordinates
    lat = ut.make_array(df, 'latitude')
    lon = ut.make_array(df, 'longitude')
    x, y = m(lon, lat)

    # 'ro' = red circles. The original passed 'r^' together with
    # marker='o', defining the marker twice; the keyword won, so circles
    # are what was actually drawn.
    m.plot(x, y, 'ro', markersize=4, alpha=.3)
    plt.title('Global Attack Density Plot: {}-{}'.format(Year[0], Year[1]),
              size=16)
    plt.show()