def parseJSON():
    data = util.loadJSON(constants.JSON_FILE)
    data_index = []
    for obj in data:
        data_dict = dict()
        temp_text = util.removePunctuation(str(data[obj]['text']))
        stopped_temp_text = util.removeStopWords(temp_text, constants.STOP_LIST)
        temp_length = len(temp_text.split(" "))
        data_dict['text'] = temp_text.lower()
        data_dict['doc_length'] = temp_length
        data_dict['doc_length_stopped'] = len(stopped_temp_text.split(" "))
        meta_data = {
            "index": {
                "_index": constants.INDEX_NAME,
                "_type": constants.TYPE_NAME,
                "_id": str(obj)
            }
        }
        data_index.append(meta_data)
        data_index.append(data_dict)
    print "Complete JSON parsed..."
    return data_index

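# A possible way to consume parseJSON()'s output (a sketch, not from the source):
# the returned list alternates index-action metadata and document bodies, which is
# the shape the Elasticsearch bulk API expects, so it can be indexed in one call
# using the client defined in constants.py.
bulk_body = parseJSON()
constants.ES_CLIENT.bulk(body=bulk_body, index=constants.INDEX_NAME)
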
def createMetaData(review_data, userListMap):
    globalBusinessMap = {}
    for review in review_data:
        userID = review['user_id']
        if userID in userListMap:
            reviewID = str(review['review_id'])
            businessID = str(review['business_id'])
            if businessID not in userListMap[userID]['restaurantMap']:
                userListMap[userID]['restaurantMap'][businessID] = []
            userListMap[userID]['restaurantMap'][businessID].append(reviewID)
            userListMap[userID]['reviewMap'][reviewID] = (review['stars'], review['date'])
            if businessID not in globalBusinessMap:
                globalBusinessMap[businessID] = [userID]
                #print 'business added!'
            elif userID not in globalBusinessMap[businessID]:
                globalBusinessMap[businessID].append(userID)
                #print 'user added!'
    business_data = util.loadJSON('../yelp/restaurants.JSON')
    for user_id, user in userListMap.items():
        restaurantSet = set(user['restaurantMap'].keys())
        calculateAttrPref(user, restaurantSet, business_data)
    user_map_file = open('userMap.json', 'w')
    json.dump(userListMap, user_map_file, indent=4)
    user_map_file.close()
    business_map_file = open('globalBusinessMap.json', 'w')
    json.dump(globalBusinessMap, business_map_file, indent=4)
    business_map_file.close()

def getModel(num_categories, split, get_names=False):
    bucket_width = 4.0 / num_categories
    buckets = [1 + bucket_width * (i + 1) for i in range(num_categories - 1)]
    buckets.append(5)
    if get_names:
        train_inputs, train_labels, ingredient_list = loadJSON(
            input_file, split, buckets=buckets, get_names=True)
        model = train(train_inputs, train_labels, num_categories)
        return model, ingredient_list, buckets, bucket_width
    train_inputs, train_labels, test_inputs, test_labels = loadJSON(
        input_file, split, buckets=buckets)
    model = train(train_inputs, train_labels, num_categories)
    return model, test_inputs, test_labels

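# Worked example of the bucketing above, assuming num_categories = 4 (a
# hypothetical choice) and labels in the 1-5 star range:
num_categories = 4
bucket_width = 4.0 / num_categories                                        # 1.0
buckets = [1 + bucket_width * (i + 1) for i in range(num_categories - 1)]
buckets.append(5)
# buckets == [2.0, 3.0, 4.0, 5]: the upper bound of each of the four buckets,
# so a 3.5-star label falls into the third bucket (upper bound 4.0).
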
def downloadData(param: Param, download: bool = True):
    '''Download user data (if {download} is True) to json files, merge them into a flat pandas.DataFrame, and write it to disk.'''
    logging.info(f"{param.filePath().name.replace('.','|')}")
    if download:
        subMethod = param.splitMethod(lower=True)
        for f in param.filePath(glob='*json'):
            f.unlink()
        pbarManager = enlighten.get_manager()
        with pbarManager.counter(unit='page', leave=False) as pbar:
            while param.page <= param.nPages:
                fileName = param.filePath(ext=f'.{param.page:04d}.json')
                response = getReq(param=param, pbarManager=pbarManager, collapse=False)
                param.page = int(response.get(subMethod).get('@attr').get('page'))
                param.nPages = int(response.get(subMethod).get('@attr').get('totalPages'))
                pbar.total = param.nPages  # [tqdm: update total without resetting time elapsed](https://stackoverflow.com/a/58961015/13019084)
                pbar.update()
                param.filePath().parent.mkdir(exist_ok=True)
                with open(file=fileName, mode='w') as jsonF:
                    json.dump(obj=response, fp=jsonF)
                param.page += 1
                time.sleep(param.sleep)
        pbarManager.stop()
    DF = loadJSON(param)
    df = flattenDF(param=param, DF=DF, writeToDisk=True)
    if param.splitMethod() in ['TopArtists', 'TopAlbums', 'TopTracks']:
        writeCSV(param=param, df=df)

def createFoodNetworkEdgeLists(userMapFile, reviewJSONFile, thresholdJaccard, thresholdRating, thresholdAttr, numUsers):
    userListMap = pickle.load(open(userMapFile, "rb"))
    review_data = util.loadJSON(reviewJSONFile)

    # 1) add meta-data
    for userID in userListMap.keys():
        userListMap[userID]['restaurantMap'] = {}  # create empty restaurantMap for each user
        userListMap[userID]['reviewMap'] = {}      # create empty reviewMap for each user
    createMetaData(review_data, userListMap)
    print 'number of users', len(userListMap)
    print userListMap

    # 2) calculate scores for each edge
    jaccardVals = createEdgeVals(userListMap, True, False, False)
    attrVals = createEdgeVals(userListMap, False, True, False)
    ratiVals = createEdgeVals(userListMap, False, False, True)

    # 3) create food network for each score type
    jaccardNtwkFile = 'food_ntwk_random/jacc_edge_list_{}users_{}.txt'.format(numUsers, thresholdJaccard)
    ratiNtwkFile = 'food_ntwk_random/rati_edge_list_{}users_{}.txt'.format(numUsers, thresholdRating)
    attrNtwkFile = 'food_ntwk_random/attr_edge_list_{}users_{}.txt'.format(numUsers, thresholdAttr)
    createFoodNetwork(jaccardVals, userListMap, jaccardNtwkFile, thresholdJaccard)
    createFoodNetwork(ratiVals, userListMap, ratiNtwkFile, thresholdRating)
    createFoodNetwork(attrVals, userListMap, attrNtwkFile, thresholdAttr)

    # 4) check that each edge list is valid and print info about each network
    g = snap.LoadEdgeList(snap.PUNGraph, jaccardNtwkFile, 0, 1)
    print 'Jaccard Network: Num Nodes = {}, Num Edges = {}'.format(g.GetNodes(), g.GetEdges())
    g = snap.LoadEdgeList(snap.PUNGraph, attrNtwkFile, 0, 1)
    print 'Attribute Network: Num Nodes = {}, Num Edges = {}'.format(g.GetNodes(), g.GetEdges())
    g = snap.LoadEdgeList(snap.PUNGraph, ratiNtwkFile, 0, 1)
    print 'Rating Network: Num Nodes = {}, Num Edges = {}'.format(g.GetNodes(), g.GetEdges())

def get_email(query):
    names = util.loadJSON("contacts.json")
    toField = None
    for contact in names:
        if contact["name"] in query:
            toField = contact["email"]
            break
    return toField

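# Minimal usage sketch for get_email (the contacts.json contents here are
# hypothetical; the function only relies on each entry having "name" and "email"):
#   contacts.json: [{"name": "alice", "email": "alice@example.com"}]
assert get_email("send an email to alice") == "alice@example.com"
assert get_email("text bob that i am running late") is None   # no matching contact
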
def createUsersToBizMap():
    review_data = util.loadJSON('../yelp/review.json')
    restaurantMap = load_data('../yelp/restaurants.json')
    UsersToBizMap = {}  # userID: set(bizID1, bizID2, ...) --> maps each user to the restaurants they have reviewed
    for review in review_data:
        review_userID = review['user_id']
        businessID = str(review['business_id'])
        if businessID not in restaurantMap:
            continue
        if review_userID not in UsersToBizMap:
            UsersToBizMap[review_userID] = set()
        UsersToBizMap[review_userID].add(businessID)
    return UsersToBizMap

def writeEmbeddingFiles():
    if ".txt" in input_file:
        train_inputs, tokens = loadTxt(input_path_prefix + input_file)
    elif ".json" in input_file:
        train_inputs, tokens = loadJSON(input_path_prefix + input_file)
    embeddings = train(train_inputs)
    stem = input_file[:input_file.find('.')]
    np.savetxt(result_path_prefix + stem + "_embeddings_%d" % dim, embeddings)
    fn = result_path_prefix + stem + "_tokens"
    if not os.path.isfile(fn):
        f = open(fn, 'w+')
        f.write(str(tokens))
        f.close()

def writeEmbeddingFiles(dims):
    if ".txt" in input_file:
        train_inputs, tokens = loadTxt(input_path_prefix + input_file)
    elif ".json" in input_file:
        train_inputs, tokens = loadJSON(input_path_prefix + input_file)
    for hidden_size in dims:
        embeddings = trainCBOW(train_inputs, hidden_size=hidden_size)
        stem = input_file[:input_file.find('.')]
        np.savetxt(result_path_prefix + stem + "_CBOWembeddings_%d_%depochs" % (hidden_size, max_epochs), embeddings)
        fn = result_path_prefix + stem + "_tokens"
        if not os.path.isfile(fn):
            f = open(fn, 'w+')
            f.write(str(tokens))
            f.close()
    return embeddings, tokens

def write_edges_file(user_data, user_map, edgesFile, edgesWithScoreFile):
    # UsersToBizMap = createUsersToBizMap()
    for userID in user_map.keys():
        user_map[userID]['restaurantMap'] = {}
        user_map[userID]['reviewMap'] = {}
    createMetaData(util.loadJSON('../yelp/review.json'), user_map)
    friend_list = []
    file = open(edgesFile, "w")
    file2 = open(edgesWithScoreFile, "w")
    scoreMap = {}
    count = 0
    for i in xrange(len(user_data)):
        friend_list.append(user_data[i]['friends'])
        for j in xrange(len(friend_list[i])):
            if friend_list[i][j] in user_map.keys():
                pair = (user_data[i]['user_id'], friend_list[i][j])
                pair2 = (friend_list[i][j], user_data[i]['user_id'])
                if pair in scoreMap:
                    score = scoreMap[pair]
                elif pair2 in scoreMap:
                    score = scoreMap[pair2]
                else:
                    score = getRatingSimScore(friend_list[i][j], user_data[i]['user_id'], user_map)
                # only write the edge if the pair scores above zero
                if score > 0:
                    src_tmp = user_map[user_data[i]['user_id']]
                    dst_tmp = user_map[friend_list[i][j]]
                    src = src_tmp['node_id']  # node_id for src of edge
                    dst = dst_tmp['node_id']  # node_id for dst of edge
                    line = "{0} {1}\n".format(src, dst)
                    lineWithScore = "{0} {1} {2}\n".format(src, dst, score)
                    file.write(line)
                    file2.write(lineWithScore)
                    count += 1
    print 'num edges', count
    file.close()
    file2.close()

def process():
    global state
    global ins_stack
    global com_stack
    global out_stack
    result = ""
    query = request.args.get("query").lower()
    if state == 0:
        # state 0: idle -- look for a command whose trigger appears in the query
        command = None
        funcs = util.loadJSON("cmd.json")
        for f in funcs:
            if funcs[f]["trigger"] in query:
                command = funcs[f]
                break
        if command is None:
            if query == "":
                return "Hello."
            elif "reset" in query:
                util.writeJSON("cmd.json", util.loadJSON("backup.json"))
                micro.micro_states = {}
            else:
                return "I couldn't understand."
        else:
            ins_stack.append(command["id"])
            if command["internal"] == True:
                state = 1
            else:
                # break a composite (taught) command down into internal steps
                x = 0
                t_i_s = [ins_stack.pop()]
                t_c_s = []
                while x < len(t_i_s):
                    while internal(t_i_s[x]) == False:
                        tuple = breakdown(t_i_s, x, funcs)
                        t_i_s = tuple[0] + t_i_s
                        t_c_s = tuple[1] + t_c_s
                    x += 1
                ins_stack = t_i_s
                com_stack = t_c_s
                state = 2
    if state == 1:
        # state 1: an internal command is in progress; pass it the raw query
        feedback = call(ins_stack[0], query, out_stack)
        if feedback[1] == True:
            ins_stack.pop(0)
            state = 0
        out_stack.append(feedback[0])
        result = feedback[0]
    elif state == 2:
        # state 2: run through the queued steps of a composite command
        feedback = call(ins_stack[0], com_stack[0], out_stack)
        while feedback[1] == True and len(ins_stack) > 0:
            ins_stack.pop(0)
            com_stack.pop(0)
            out_stack.append(feedback[0])
            if len(ins_stack) > 0:
                feedback = call(ins_stack[0], com_stack[0], out_stack)
        result = feedback[0]
        if len(ins_stack) == 0 and feedback[1] == True:
            state = 0
    return result

def teach_command(command_string, out_stack):
    print("DEBUG" + command_string)
    frame = inspect.currentframe()
    state_key = str(inspect.getframeinfo(frame).function)
    DEFAULT_STATE = (None, [])
    if state_key not in micro_states:
        m1 = re.search("(.+)called (.+)", command_string)
        if m1 is None:
            micro_states[state_key] = DEFAULT_STATE
            return "What should I call this?", False
        else:
            command_name = m1.group(2)
            micro_states[state_key] = (command_name, [])
            cmds = util.loadJSON("cmd.json")
            conflict = False
            for x in cmds:
                x_trigger = cmds[x]["trigger"]
                if x_trigger in command_name or command_name in x_trigger:
                    conflict = True
            if conflict:
                del micro_states[state_key]
                return "Invalid command name.", True
            return "How do I do this?", False
    else:
        if "cancel" in command_string:
            del micro_states[state_key]
            return "Teaching cancelled", True
        current_state = micro_states[state_key]
        if current_state[0] is None:
            command_name = command_string
            cmds = util.loadJSON("cmd.json")
            conflict = False
            for x in cmds:
                x_trigger = cmds[x]["trigger"]
                if x_trigger in command_name or command_name in x_trigger:
                    conflict = True
            if conflict:
                del micro_states[state_key]
                return "Invalid command name.", True
            micro_states[state_key] = (command_name, [])
            return "How do I do this?", False
        else:
            if "finish" in command_string:
                if len(micro_states[state_key][1]) > 0:
                    seq_id = [i[0] for i in micro_states[state_key][1]]
                    seq_str = [i[1] for i in micro_states[state_key][1]]
                    util.saveFunction(micro_states[state_key][0], seq_id, seq_str)
                    del micro_states[state_key]
                    return "New command saved", True
                else:
                    del micro_states[state_key]
                    return "Incorrect number of steps", True
            else:
                command_step = command_string
                cmds = util.loadJSON("cmd.json")
                step_id = None
                step_str = command_string
                for x in cmds:
                    x_trigger = cmds[x]["trigger"]
                    if x_trigger in command_step:
                        if x_trigger == "teach":
                            continue
                        x_id = cmds[x]["id"]
                        step_id = x_id
                        break
                if step_id is None:
                    return "Sorry. I didn't get that", False
                else:
                    micro_states[state_key][1].append((step_id, step_str))
                    return "OK. What next?", False

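# Illustrative teaching dialogue (hypothetical utterances; assumes cmd.json holds
# a command whose trigger is "weather" and nothing that clashes with the new name).
# Each call returns a (reply, finished) tuple.
reply, done = teach_command("teach me a command called good morning", [])  # ("How do I do this?", False)
reply, done = teach_command("check the weather", [])                       # ("OK. What next?", False)
reply, done = teach_command("finish", [])                                  # ("New command saved", True)
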
import cv2
import numpy as np
import util as u

# undistort the camera image with the calibration data stored in:
distortionData = u.loadJSON(
    r"C:\Users\reuli\Documents\Bachelor Assignment\Robot project\python\calibration\calibration"
)

# undistort the grayscale camera image
def undistort(gray):
    dst = cv2.undistort(gray, np.float32(distortionData["cameraMatrix"]),
                        np.float32(distortionData["distCoeffs"]), None,
                        np.float32(distortionData["newCameraMatrix"]))
    x, y, w, h = distortionData["validPixROI"]
    dst = dst[y:y + h, x:x + w]
    return dst

import cv2
import numpy as np
import util as u

transformData = u.loadJSON(
    r"C:\Users\reuli\Documents\Bachelor Assignment\Robot project\python\calibration\transformationMatrix"
)

# perspective-transform the image
def transform(gray):
    warped = cv2.warpPerspective(
        gray, np.float32(transformData["transform_matrix"]),
        (transformData["maxWidth"], transformData["maxHeight"]))
    return warped

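# Minimal usage sketch chaining the two calibration helpers above on a live
# camera frame (camera index 0 and the module names are assumptions, not from
# the source):
import cv2
from undistortion import undistort
from transformation import transform

cap = cv2.VideoCapture(0)
ok, frame = cap.read()
if ok:
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)  # helpers expect a grayscale image
    corrected = undistort(gray)                     # remove lens distortion, crop to valid ROI
    topdown = transform(corrected)                  # warp to the calibrated perspective
cap.release()
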
from __future__ import division
from elasticsearch import Elasticsearch
import util, es_utility, math
import constants, search

QUERY = es_utility.readQueryFile("query_desc.51-100.short.txt")
qno = [85, 59, 56, 71, 64, 62, 93, 99, 58, 77, 54, 87, 94, 100, 89, 61, 95,
       68, 57, 97, 98, 60, 80, 63, 91]
QUERY_MAP = zip([str(q) for q in qno], QUERY)
expanded_terms = util.loadJSON('prf_data.json')


def saveResults(score_fuction):
    filename = score_fuction + "_prf_results.txt"
    with open(filename, 'wb') as f:
        for n, q in QUERY_MAP:
            stemmed_terms = es_utility.getRootQuery(" ".join(q))
            new_terms = [t for t in expanded_terms[n] if t not in stemmed_terms]
            root = stemmed_terms + new_terms
            if score_fuction == "okapi":
                body = search.OKAPI_BODY(root)
            elif score_fuction == "tfidf":
                body = search.TFIDF_BODY(root)
            elif score_fuction == "bm25":
                body = search.BM25_BODY(root)
            elif score_fuction == "lm":
                body = search.LM_BODY(root)
            elif score_fuction == 'jm':
                body = search.JM_BODY(root)
            else:

if __name__ == '__main__':

    def doRegression():
        param_list = train(data_train, labels_train)
        best_epoch, best_loss = test_bunch(data_test, labels_test, param_list)
        print "Best epoch:", best_epoch
        best_params = param_list[best_epoch]
        test(data_dev, labels_dev, best_params)
        test(data_train, labels_train, best_params, fn="train_preds")

    def doClassification():
        param_list = trainBuckets(data_train, labels_train)
        testBuckets(data_test, labels_test, param_list)

    # See util.py for an explanation of the loading process.
    # [0.6, 0.8] means 0.6 train, 0.2 dev, 0.2 test.
    all_data = list(loadJSON(input_file, [0.6, 0.8]))

    # Optional; spreads the data out to approximate a uniform distribution.
    # mus/sigmas are needed if you want to renormalize later (see the fn above).
    mus, sigmas = undoLabelNormalization(all_data)

    data_train, labels_train, data_dev, labels_dev, data_test, labels_test = all_data
    print "Train size:", data_train.shape[0]
    print "Test size:", data_test.shape[0]

    doRegression()
    # doClassification()

import util, es_utility, glob
from elasticsearch import Elasticsearch

## For reading the documents
ADVANCED_PRE_PROCESSING = False
STREAM = False

## global values
JSON_FILE = "AP_DATA_FULL.json"
GLOBAL_JSON_FILE = "crawler.json"
ES_HOST = {"host": "localhost", "port": 9210}
##INDEX_NAME = 'ap_streamed_none'
INDEX_NAME = 'backup_vs'
TYPE_NAME = 'document'
ANALYZER_NAME = 'my_english'
##QREL_FILENAME = 'qrels.adhoc.51-100.AP89.txt'
QREL_FILENAME = 'QREL.txt'
STOP_LIST = util.getStopList("stoplist.txt")
DOC_LIST = util.getDocList('doclist.txt')
INDEX_CONSTANTS = util.loadJSON(GLOBAL_JSON_FILE)
CORPUS = glob.glob("ap89_collection/ap*")
RESULT_SIZE = 10
ES_CLIENT = Elasticsearch(hosts=[ES_HOST], timeout=180)
##print ES_CLIENT.count(index=INDEX_NAME)['count']