import json
from pprint import pprint

from tag import count_words

try:
    # The ``sets`` module only exists on Python 2; keep the name available
    # for any code that still refers to ``Set``.
    from sets import Set
except ImportError:
    Set = set

with open('data.json') as data_file:
    data = json.load(data_file)

# pprint([x for x in data if x['winner'] == True])
# print(len(data))

# Win vector: 1 for every record whose 'winner' field is truthy, else 0.
win_vector = [1 if x['winner'] else 0 for x in data]
print(win_vector)

# One count vector per record.  Each vector is indexed and reassigned below,
# so count_words() is expected to return a mutable sequence of
# (word, count) pairs -- TODO confirm against tag.count_words.
count_vectors = [count_words(x['description']) for x in data]

# Replace raw counts with each word's share of the vector's total count.
# Vectors whose counts sum to zero are left untouched to avoid dividing by 0.
for vector in count_vectors:
    total = sum(pair[1] for pair in vector)
    if total != 0:
        for index, pair in enumerate(vector):
            vector[index] = (pair[0], float(pair[1]) / total)
print(count_vectors)

# Vocabulary: every distinct word seen in any count vector.
# (The original looped over the undefined name ``count_vector`` here, which
# raised NameError before any word was collected.)
words = set()
for vector in count_vectors:
    for word_tuple in vector:
        words.add(word_tuple[0])
def _load_pickle(filename):
    """Unpickle ``filename`` from this module's directory and close the file.

    NOTE(review): unpickling executes arbitrary code; this is only acceptable
    as long as the .pkl files are trusted artifacts shipped with the module.
    """
    path = os.path.join(os.path.dirname(__file__), filename)
    with open(path, 'rb') as pkl_file:
        return pickle.load(pkl_file, encoding='latin1')


def predict(json):
    """Return whether the classifier labels one project record as class 1.0.

    Args:
        json: A single project record.  The code reads ``json['description']``
            and ``json['authors']``; every author entry must provide
            ``'wins'``, ``'projects'`` and ``'hackathons'``.  (The parameter
            name shadows the ``json`` module inside this function.)

    Returns:
        The result of ``clf.predict([features])[0] == 1.0``.

    Raises:
        KeyError: If one of the fields listed above is missing.
        IndexError: If ``maxOcc.pkl`` holds fewer entries than the feature
            vector (vocabulary size + 3).

    NOTE(review): ``clf`` is not defined in this function and must exist at
    module level.  Commented-out code that used to live here fitted an
    ``svm.SVC(gamma=0.001, C=100)`` and dumped it to 'clf.pkl' -- presumably
    that is the model expected here; confirm.
    """
    data = [json]
    count_vectors = [count_words(x['description']) for x in data]
    words = _load_pickle('words.pkl')

    matrix = []
    for element, counts in zip(data, count_vectors):
        # Map word -> count once instead of rescanning ``counts`` for every
        # vocabulary word.  A repeated word keeps its last count, exactly as
        # the original nested loop did.
        count_by_word = {item[0]: item[1] for item in counts}
        row = [count_by_word.get(word, 0) for word in words]

        # Aggregate the authors' track record into three extra features.
        nr_wins = 0
        nr_projects = 0
        nr_hacks = 0
        for author in element["authors"]:
            nr_wins += author["wins"]
            nr_projects += author["projects"]
            nr_hacks += author["hackathons"]

        # Win ratio (0 when the authors have no projects), then the raw win
        # and hackathon totals -- this order must match the layout of maxOcc.
        row.append(float(nr_wins) / nr_projects if nr_projects != 0 else 0)
        row.append(nr_wins)
        row.append(nr_hacks)
        matrix.append(row)

    # Scale each feature by the per-column value stored in maxOcc.pkl.
    maxOcc = _load_pickle('maxOcc.pkl')
    for row in matrix:
        for j, value in enumerate(row):
            row[j] = float(value) / maxOcc[j]

    return (clf.predict([matrix[0]])[0] == 1.0)
def predict(json):
    """Classify a single project record and report whether it is class 1.0.

    Args:
        json: One project record.  ``json['description']`` is passed to
            ``count_words`` and ``json['authors']`` is iterated; each author
            must carry ``'wins'``, ``'projects'`` and ``'hackathons'``.
            (Inside this function the name hides the ``json`` module.)

    Returns:
        ``clf.predict([features])[0] == 1.0``.

    Raises:
        KeyError: If a required field is absent from the record.
        IndexError: If ``maxOcc.pkl`` has fewer entries than the feature
            vector (one per vocabulary word plus three author features).

    NOTE(review): ``clf`` is read from module scope; it is not created here.
    Earlier commented-out code trained ``svm.SVC(gamma=0.001, C=100)`` and
    saved it as 'clf.pkl' -- presumably the model this relies on; confirm.
    """
    module_dir = os.path.dirname(__file__)
    data = [json]
    count_vectors = [count_words(x['description']) for x in data]

    # ``with`` guarantees the handle is closed even if unpickling fails.
    # NOTE(review): pickle runs arbitrary code on load; only safe while these
    # files are trusted artifacts distributed with the module.
    with open(os.path.join(module_dir, 'words.pkl'), 'rb') as pkl_file:
        words = pickle.load(pkl_file, encoding='latin1')

    matrix = []
    for counts in count_vectors:
        # Dict lookup replaces a full scan of ``counts`` per vocabulary word;
        # later duplicates overwrite earlier ones, as in the original loop.
        occurrences = {item[0]: item[1] for item in counts}
        matrix.append([occurrences.get(word, 0) for word in words])

    # Append three author-derived features to each row, in the order
    # win ratio, total wins, total hackathons.
    for row, element in zip(matrix, data):
        nr_wins = 0
        nr_projects = 0
        nr_hacks = 0
        for author in element["authors"]:
            nr_wins = nr_wins + author["wins"]
            nr_projects = nr_projects + author["projects"]
            nr_hacks = nr_hacks + author["hackathons"]
        if nr_projects == 0:
            # No projects at all: define the win ratio as 0 instead of
            # dividing by zero.
            row.append(0)
        else:
            row.append(float(nr_wins) / nr_projects)
        row.append(nr_wins)
        row.append(nr_hacks)

    with open(os.path.join(module_dir, 'maxOcc.pkl'), 'rb') as pkl_file:
        maxOcc = pickle.load(pkl_file, encoding='latin1')

    # Scale every feature by the matching per-column entry of maxOcc.
    for row in matrix:
        for j in range(len(row)):
            row[j] = float(row[j]) / maxOcc[j]

    return (clf.predict([matrix[0]])[0] == 1.0)
from sklearn import svm import numpy as np from sklearn.externals import joblib with open('projects_with_users.json') as data_file: data = json.load(data_file) # pprint([x for x in data if x['winner'] == True]) # print(len(data)) #random.shuffle(data) # Win vector #win_vector = [1 if x['winner'] else 0 for x in data] #print(win_vector) count_vectors = [count_words(x['description']) for x in data] words = [ ] for x in count_vectors: for pair in x: if words.count(pair[0]) == 0: words.append(pair[0]) maxOcc = [ ] minOcc = [ ] for j in range(0, len(words) + 3): maxOcc.append(0) minOcc.append(0) matrix = [ ] for i in range(0, len(count_vectors)):