import pandas as pd
from dateutil.parser import parse
import read

hours = read.load_data()

def extract_hours(timestamp):
    convert = parse(timestamp)
    hour = convert.hour
    return hour

def extract_day(timestamp):
    convert = parse(timestamp)
    day = convert.weekday()
    return day

if __name__ == "__main__":
    hours["submission_hour"] = hours["submission_time"].apply(extract_hours)
    hours["submission_day"] = hours["submission_time"].apply(extract_day)
    print(hours["submission_hour"].value_counts())
    print(hours["submission_day"].value_counts())
    
import read

df = read.load_data()

# The column name was left as a placeholder in the original snippet; "url" is
# assumed here, since the other examples rank the most frequent domains.
print(df["url"].value_counts(ascending=False).head(100))
import pandas as pd
import read
from collections import Counter

data_count = read.load_data()

# Concatenate every headline into one long, space-separated string
headlines_str = ""
for i in data_count["headline"]:
    headlines_str += str(i) + " "

headlines_str = headlines_str.lower()

# Split into words and count the 100 most common
words = headlines_str.split()
top_100 = Counter(words).most_common(100)


if __name__ == "__main__":
    print(top_100)
import pandas as pd
import read

domains = read.load_data()

top_20_domains = domains["url"].value_counts()[:20]

if __name__ == "__main__":
    print(top_20_domains)
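# Note that value_counts() above ranks whatever is stored in the "url" column
# as-is. If that column mixes full URLs, "www." prefixes, and bare hostnames,
# a hedged sketch of normalizing each value to a plain domain before counting
# (an assumption about the data, not part of the original example):
def to_domain(value):
    value = str(value)
    # Keep only the host part if the value is a full URL
    if "://" in value:
        value = value.split("://", 1)[1].split("/", 1)[0]
    # Count www.example.com and example.com as the same domain
    if value.startswith("www."):
        value = value[len("www."):]
    return value

if __name__ == "__main__":
    print(domains["url"].dropna().apply(to_domain).value_counts().head(20))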
Example #5
# Imports required by training() (legacy Keras 1.x API, matching the companion
# prediction script in Example #6):
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.utils import np_utils
from read import load_data

def training():
	batch_size = 1
	nb_classes = 4
	nb_epoch = 1

	# input image dimensions
	img_rows, img_cols = 10, 10
	# number of convolutional filters to use
	nb_filters = 32
	# size of pooling area for max pooling
	nb_pool = 2
	# convolution kernel size
	nb_conv = 3

	# the data, shuffled and split between train and test sets
	# (X_train, y_train), (X_test, y_test) = mnist.load_data()

	# X_train = X_train.reshape(X_train.shape[0], 1, img_rows, img_cols)
	# X_test = X_test.reshape(X_test.shape[0], 1, img_rows, img_cols)
	# X_train = X_train.astype('float32')
	# X_test = X_test.astype('float32')
	# X_train /= 255
	# X_test /= 255
	# print('X_train shape:', X_train.shape)
	# print(X_train.shape[0], 'train samples')
	# print(X_test.shape[0], 'test samples')

	# print('log:')
	# print(y_test)
	# print('log end')

	X_train,Y_train=load_data("train.data")
	X_test,Y_test=load_data("test.data")

	# convert class vectors to binary class matrices
	Y_train = np_utils.to_categorical(Y_train, nb_classes)
	Y_test = np_utils.to_categorical(Y_test, nb_classes)
	# Y_test = np_utils.to_categorical(y_test, nb_classes)
	# print(y_test)

	# print('log:')
	# print(X_test[1])
	# print(Y_test)
	# print('log end')

	model = Sequential()

	model.add(Convolution2D(nb_filters, nb_conv, nb_conv,
	                        border_mode='valid',
	                        input_shape=(1, img_rows, img_cols)))
	model.add(Activation('relu'))
	model.add(Convolution2D(nb_filters, nb_conv, nb_conv))
	model.add(Activation('relu'))
	model.add(MaxPooling2D(pool_size=(nb_pool, nb_pool)))
	model.add(Dropout(0.25))

	model.add(Flatten())
	model.add(Dense(128))
	model.add(Activation('relu'))
	model.add(Dropout(0.5))
	model.add(Dense(nb_classes))
	model.add(Activation('softmax'))

	model.compile(loss='categorical_crossentropy', optimizer='adadelta')

	model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch,
	          show_accuracy=True, verbose=1, validation_data=(X_test, Y_test))

	return model
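# For reference, a hedged sketch of the same architecture under the modern
# tf.keras API (Conv2D with a kernel-size tuple, padding= instead of
# border_mode=, epochs= instead of nb_epoch=, accuracy requested via metrics=).
# It assumes channels-last input of shape (n_samples, img_rows, img_cols, 1)
# and is not the original author's code; imports stay local so they do not
# clash with the legacy keras imports above.
def training_modern(X_train, Y_train, X_test, Y_test,
                    img_rows=10, img_cols=10, nb_classes=4,
                    nb_filters=32, nb_conv=3, nb_pool=2,
                    batch_size=1, epochs=1):
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import (Conv2D, MaxPooling2D, Dense,
                                         Dropout, Flatten)

    model = Sequential()
    model.add(Conv2D(nb_filters, (nb_conv, nb_conv), padding='valid',
                     activation='relu', input_shape=(img_rows, img_cols, 1)))
    model.add(Conv2D(nb_filters, (nb_conv, nb_conv), activation='relu'))
    model.add(MaxPooling2D(pool_size=(nb_pool, nb_pool)))
    model.add(Dropout(0.25))

    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(nb_classes, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adadelta',
                  metrics=['accuracy'])
    model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs,
              verbose=1, validation_data=(X_test, Y_test))
    return model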
Example #6
from __future__ import print_function
import numpy as np
np.random.seed(1337)  # for reproducibility

from keras.datasets import mnist
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.utils import np_utils
from read import load_data
from cnn import training

model=training()
X_test,Y_test=load_data("test.data")
batch_size = 1
test=[]
test.append(X_test[0])
print("warning:")
print(test)
print("warning2:")
print(X_test)
out=model.predict(X_test,batch_size=batch_size,verbose=1)
print('out=',out)
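# The softmax output 'out' has one row per test sample and one probability per
# class; a minimal follow-up sketch for turning it into predicted class indices:
predicted_classes = np.argmax(out, axis=1)
print('predicted classes =', predicted_classes)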
Example #7
from dateutil.parser import parse
from datetime import datetime
import read

df = read.load_data()

def get_hour(time_str):
    # Parse the timestamp string and return just the hour
    return parse(time_str).hour

if __name__ == "__main__":
    df["hour"] = df["submission_time"].apply(get_hour)
    print(df["hour"].value_counts())

import read
from collections import Counter

stories = read.load_data()

# Collect the headlines; missing headlines come through as NaN (a float),
# so convert those to strings to keep join() happy
headlines = []
for head in stories["headline"]:
    if type(head) == float:
        headlines.append(str(head))
    else:
        headlines.append(head)

# Join into one long string, split into words, lowercase, and count
joined = ' '.join(headlines)
words = joined.split(" ")
words_lower = []
for word in words:
    words_lower.append(word.lower())

counts = Counter(words_lower)
print(counts.most_common(100))
Example #9
""" This script looks for words appear most often in the headlines 
"""
import read
from collections import Counter

data = read.load_data()

# Make sure all elements in data['headline'] are strings
data['headline'] = data['headline'].astype(str)

# Take the headline column (an iterable of headline strings)
string_lists = data['headline']

# Join the strings to make one long string
long_string = " ".join(string_lists)

# Make all the words in the string lowercase
long_string = long_string.casefold()

# Split the long string into individual words
words = long_string.split(' ')

# Count the 100 words that occur most often in the data
word_count = Counter(words).most_common(100)

print(word_count)
Example #10
from read import load_data
import pandas as pd
from collections import Counter

df = load_data()

#Remove missing value in headline
headline = df['headline'].dropna()

#Combine all of the headlines into one long string
headline_full = ' '.join(headline.tolist())

#Remove special characters
special_chars = "!@#$%^&*()+-?:|[]'\"_"
for char in special_chars:
    headline_full = headline_full.replace(char, "")

headline_full = headline_full.lower().split()
     
#Count words in headline
c = Counter(headline_full)
if __name__ == "__main__":
    print(c.most_common(100))
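# A roughly equivalent, more compact sketch (not the original author's
# approach): strip punctuation with a regular expression instead of looping
# over a hand-picked character list.
import re

headline_joined = " ".join(df['headline'].dropna().astype(str))
cleaned_words = re.sub(r"[^\w\s]", "", headline_joined).lower().split()
if __name__ == "__main__":
    print(Counter(cleaned_words).most_common(100))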
Example #11
                        "--facts_test",
                        required=True,
                        help="file containing facts for testing")
    parser.add_argument("-t",
                        "--target",
                        required=True,
                        help="the target predicate")

    cl_args = parser.parse_args()
    background_fname = cl_args.background
    facts_fname_train = cl_args.facts_train
    facts_fname_test = cl_args.facts_test
    target = cl_args.target

    dfs_train = load_metadata(background_fname)
    load_data(facts_fname_train, dfs_train)
    print("done reading (" + str(time.time() - begtime) + ")")

    attr_name = None
    labels_df_train = None
    relations_train = []
    rel_names_train = []
    for name, df in dfs_train.items():
        colnames = df.columns.values.tolist()
        attr_name = colnames[0]
        df.columns = [attr_name + "0", attr_name + "1"]

        if target == name:
            labels_df_train = df
        else:
            rel_names_train.append(name)
Example #12
import read
import pandas as pd
df = read.load_data("hn_stories.csv")
long_string = ""
space = " "
# Concatenate every headline into one space-separated string
for idx, row in df.iterrows():
    long_string = long_string + str(row["headline"]) + space

list_str = long_string.split(" ")
dict_str = {}
for P in list_str:
    if P in dict_str:
        dict_str[P] = dict_str[P] + 1
    else:
        dict_str[P] = 1

ser_word = pd.Series(dict_str, name="count")
ser_word.index.name = "word"
ser_word = ser_word.sort_values(ascending=False)

print(ser_word.head(10))
Example #13
#!/usr/bin/env python
"""
Which words appear most often in the headlines?
"""
import collections

from read import load_data

# Read in the Hacker News dataset
hn = load_data()

# Combine all of the headlines into one long string
all_headlines = ""
for headline in hn['headline']:
    all_headlines += str(headline).lower() + " "

# Split the long string into words
all_words = all_headlines.split()

# Count up how many times each word occurs
counts = collections.Counter(all_words)

# Print the 100 words that occur the most
print(counts.most_common(100))
Example #14
import read
import pandas as pd
import collections as cl

data2 = read.load_data()
loopcount = 0
for name, row in data2['url'].value_counts().items():
    loopcount += 1
    print('{}  {}:{}'.format(loopcount, name, row))
    if loopcount == 100:
        break
Example #15
import read
import collections

hn_stories = read.load_data()
domains = hn_stories["url"]

domains2 = domains.tolist()
nosubdomains = []
for i in domains2:
    i = str(i)
    # Strip one leading subdomain label (e.g. blog.example.com -> example.com)
    # so subdomains are counted together with their parent domain
    cnt = i.count(".")
    if cnt >= 2:
        newstr = i.split(".", 1)[1]
        nosubdomains.append(newstr)
    else:
        nosubdomains.append(i)

# Print the 100 most common domains with their counts
for name, row in collections.Counter(nosubdomains).most_common(100):
    print("{0}: {1}".format(name, row))