import pandas as pd
from dateutil.parser import parse
from datetime import *
import read

hours = read.load_data()


def extract_hours(timestamp):
    convert = parse(timestamp)
    hour = convert.hour
    return hour


def extract_day(timestamp):
    convert = parse(timestamp)
    day = convert.weekday()
    return day


if __name__ == "__main__":
    hours["submission_hour"] = hours["submission_time"].apply(extract_hours)
    hours["submission_day"] = hours["submission_time"].apply(extract_day)
    print(hours["submission_hour"].value_counts())
    print(hours["submission_day"].value_counts())
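# Every script in this section imports a small "read" module that is not
# included here. The sketch below is only a guess at what it provides: a
# load_data() helper that reads the Hacker News stories CSV into a pandas
# DataFrame. The filename "hn_stories.csv" is taken from the script further
# down that calls read.load_data("hn_stories.csv"); the exact columns and
# whether the file has a header row are assumptions.

# read.py (hypothetical sketch)
import pandas as pd


def load_data(filename="hn_stories.csv"):
    # Assumes a header row naming at least the columns the scripts use:
    # submission_time, url and headline.
    return pd.read_csv(filename)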
import read

df = read.load_data()

# The original had a "column value" placeholder here; "url" is used below as
# an example column name. Print the 100 most frequent values in that column.
column = "url"
print(df[column].value_counts(ascending=False).head(100))
import pandas as pd
import read
from collections import Counter

data_count = read.load_data()

# Build one long lowercase string out of all the headlines
headlines_str = " "
for i in data_count["headline"]:
    add_space = str(i) + " "
    headlines_str += str(add_space)

headlines_str = headlines_str.lower()

# Split into words and keep the 100 most common ones
words = headlines_str.split()
top_100 = Counter(words).most_common(100)

if __name__ == "__main__":
    print(top_100)
import pandas as pd
import read

domains = read.load_data()
top_20_domains = domains["url"].value_counts()[:20]

if __name__ == "__main__":
    print(top_20_domains)
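# The script above ranks raw "url" values; if some of them include a scheme
# or a path, one way to rank by domain instead is to keep only the network
# location of each value. This is an illustrative sketch, not one of the
# original scripts; it assumes the same read.load_data() helper.
from urllib.parse import urlparse

import read


def domain_of(url):
    url = str(url)
    # urlparse only fills in netloc when a scheme or leading "//" is present
    if "://" not in url:
        url = "//" + url
    return urlparse(url).netloc


stories = read.load_data()
domains_only = stories["url"].dropna().apply(domain_of)
print(domains_only.value_counts()[:20])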
# Imports assumed by this function (Keras 1.x API); the original file's own
# import block is not shown here.
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.utils import np_utils

from read import load_data


def training():
    batch_size = 1
    nb_classes = 4
    nb_epoch = 1

    # input image dimensions
    img_rows, img_cols = 10, 10
    # number of convolutional filters to use
    nb_filters = 32
    # size of pooling area for max pooling
    nb_pool = 2
    # convolution kernel size
    nb_conv = 3

    # the data, shuffled and split between train and test sets
    # (X_train, y_train), (X_test, y_test) = mnist.load_data()
    # X_train = X_train.reshape(X_train.shape[0], 1, img_rows, img_cols)
    # X_test = X_test.reshape(X_test.shape[0], 1, img_rows, img_cols)
    # X_train = X_train.astype('float32')
    # X_test = X_test.astype('float32')
    # X_train /= 255
    # X_test /= 255
    # print('X_train shape:', X_train.shape)
    # print(X_train.shape[0], 'train samples')
    # print(X_test.shape[0], 'test samples')
    # print('log:')
    # print(y_test)
    # print('log end')

    X_train, Y_train = load_data("train.data")
    X_test, Y_test = load_data("test.data")

    # convert class vectors to binary class matrices
    Y_train = np_utils.to_categorical(Y_train, nb_classes)
    Y_test = np_utils.to_categorical(Y_test, nb_classes)
    # Y_test = np_utils.to_categorical(y_test, nb_classes)
    # print(y_test)
    # print('log:')
    # print(X_test[1])
    # print(Y_test)
    # print('log end')

    model = Sequential()

    model.add(Convolution2D(nb_filters, nb_conv, nb_conv,
                            border_mode='valid',
                            input_shape=(1, img_rows, img_cols)))
    model.add(Activation('relu'))
    model.add(Convolution2D(nb_filters, nb_conv, nb_conv))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(nb_pool, nb_pool)))
    model.add(Dropout(0.25))

    model.add(Flatten())
    model.add(Dense(128))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adadelta')

    model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch,
              show_accuracy=True, verbose=1,
              validation_data=(X_test, Y_test))

    return model
from __future__ import print_function
import numpy as np
np.random.seed(1337)  # for reproducibility

from keras.datasets import mnist
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.utils import np_utils

from read import load_data
from cnn import training

model = training()

X_test, Y_test = load_data("test.data")
batch_size = 1

test = []
test.append(X_test[0])
print("warning:")
print(test)
print("warning2:")
print(X_test)

out = model.predict(X_test, batch_size=batch_size, verbose=1)
print('out=', out)
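# The probabilities in "out" come from the model's 4-way softmax layer, one
# row per test sample. To read them as predicted class labels, one option is
# to take the index of the largest probability in each row. A minimal
# follow-up sketch, reusing "out" and "np" from the script above:
predicted_classes = np.argmax(out, axis=1)
print('predicted classes =', predicted_classes)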
from dateutil.parser import parse
from datetime import datetime
import read

df = read.load_data()


def get_hour(time_str):
    # parse the timestamp string and return only the hour
    return parse(time_str).hour
import read
from collections import Counter

stories = read.load_data()

# Collect the headlines into a list
l = []
headline = stories["headline"]
for head in headline:
    l.append(head)

# Convert missing values (floats) to strings so join() works
l2 = []
for s in l:
    if type(s) == float:
        l2.append(str(s))
    else:
        l2.append(s)

# Join everything into one string, split it into words and lowercase them
s = ' '.join(l2)
slist = s.split(" ")

slist_lower = []
for s in slist:
    slist_lower.append(s.lower())

# Print the 100 most common words
counts = Counter(slist_lower)
print(counts.most_common(100))
""" This script looks for words appear most often in the headlines """ import read from collections import Counter data = read.load_data() #Makes sure all elemnts in data['headline'] are strings data['headline'] = data['headline'].astype(str) #This moves the elements in data['headline'] to a list string_lists = data['headline'] #This then joins the strings to make one long string long_string = " ".join(string_lists) #This makes all the elementsin the string lowercase long_string = long_string.casefold() #This splits the long string into individual words words = long_string.split(' ') #This counts the 100 words that occur the most in the data word_count = Counter(words).most_common(100) print(word_count)
from read import load_data
import pandas as pd
from collections import Counter

df = load_data()

# Remove missing values in headline
headline = df['headline'].dropna()

# Combine all of the headlines into one long string
headline_full = ' '.join(headline.tolist())

# Remove special characters
special_chars = "!@#$%^&*()+-?:|[]'\"_"
for char in special_chars:
    headline_full = headline_full.replace(char, "")

headline_full = headline_full.lower().split()

# Count words in the headlines
c = Counter(headline_full)

if __name__ == "__main__":
    print(c.most_common(100))
"--facts_test", required=True, help="file containing facts for testing") parser.add_argument("-t", "--target", required=True, help="the target predicate") cl_args = parser.parse_args() background_fname = cl_args.background facts_fname_train = cl_args.facts_train facts_fname_test = cl_args.facts_test target = cl_args.target dfs_train = load_metadata(background_fname) load_data(facts_fname_train, dfs_train) print("done reading (" + str(time.time() - begtime) + ")") attr_name = None labels_df_train = None relations_train = [] rel_names_train = [] for name, df in dfs_train.items(): colnames = df.columns.values.tolist() attr_name = colnames[0] df.columns = [attr_name + "0", attr_name + "1"] if target == name: labels_df_train = df else: rel_names_train.append(name)
import read
import pandas as pd

df = read.load_data("hn_stories.csv")

# Build one long string out of all the headlines
long_string = ""
space = " "
for idx, row in df.iterrows():
    long_string = long_string + str(row["headline"]) + space

# Count how often each word occurs
list_str = long_string.split(" ")
dict_str = {}
for P in list_str:
    if P in dict_str:
        dict_str[P] = dict_str[P] + 1
    else:
        dict_str[P] = 1

# Turn the counts into a Series and print the ten most common words
ser_word = pd.Series(dict_str, name="count")
ser_word.index.name = "word"
ser_word.sort_values(ascending=False, inplace=True)
print(ser_word.head(10))
#!/usr/bin/env python
"""
Which words appear most often in the headlines?
"""
import collections

from read import load_data

# Read in the Hacker News dataset
hn = load_data()

# Combine all of the headlines into one long string
all_headlines = ""
for headline in hn['headline']:
    all_headlines += str(headline).lower() + " "

# Split the long string into words
all_words = all_headlines.split()

# Count up how many times each word occurs
counts = collections.Counter(all_words)

# Print the 100 words that occur the most
print(counts.most_common(100))
import read
import pandas as pd
import collections as cl

data2 = read.load_data()

# Print the 100 most common URLs along with how often each one appears
loopcount = 0
for name, row in data2['url'].value_counts().items():
    loopcount += 1
    print('{} {}:{}'.format(loopcount, name, row))
    if loopcount == 100:
        break
import read
import collections

hn_stories = read.load_data()

domains = hn_stories["url"]
domains2 = domains.tolist()

# Strip the subdomain from each URL: if there are two or more dots,
# keep everything after the first one
nosubdomains = []
for i in domains2:
    i = str(i)
    cnt = i.count(".")
    if cnt >= 2:
        newstr = i.split(".", 1)[1]
        nosubdomains.append(newstr)
    else:
        nosubdomains.append(i)

# print(collections.Counter(nosubdomains).most_common(100))

for name, row in domains.items():
    print("{0}: {1}".format(name, row))