def knn_visual(k, locationid, model, database, this_vector=None):
    """Find the k nearest locations to `locationid` by visual-descriptor distance.

    Parameters
    ----------
    k : int
        Number of neighbors to keep.
    locationid
        Id of the query location; it is excluded from the candidate set.
    model : str
        Visual model name, forwarded to Vectorizer.visual_vector.
    database
        Data-access object providing get_location_ids().
    this_vector : optional
        Precomputed vector for `locationid`; computed on demand when omitted.

    Returns
    -------
    dict
        Mapping of location id -> distance for the (up to) k closest locations.
    """
    nearest = {}
    if not this_vector:
        this_vector = Vectorizer.visual_vector(locationid, database, model)
    others = database.get_location_ids()
    others.remove(locationid)
    # Maintain a rolling set of the k best candidates: once the dict is full,
    # a new candidate replaces the current worst when it is strictly closer.
    for other in others:
        other_vector = Vectorizer.visual_vector(other, database, model)
        distance = Distance.l_p_distance(3, this_vector, other_vector,
                                         positional=True)
        if len(nearest) < k:
            largest_key, largest_best = None, inf
        else:
            largest_key, largest_best = max(nearest.items(), key=itemgetter(1))
        if distance < largest_best:
            # BUG FIX: compare to None explicitly. A falsy key (e.g. location
            # id 0) must still be evicted, otherwise `nearest` grows past k.
            if largest_key is not None:
                nearest.pop(largest_key)
            nearest[other] = distance
    return nearest
def knn_visual_all(k, locationid, models, database, this_vector=None):
    """Find the k nearest locations to `locationid` using all visual models.

    Each location is represented by a multi-model vector (one averaged entry
    per model); distance is the positional L3 distance between those vectors.

    Parameters
    ----------
    k : int
        Number of neighbors to keep.
    locationid
        Id of the query location; it is excluded from the candidate set.
    models
        Iterable of model names, forwarded to visual_vector_multimodel.
    database
        Data-access object providing get_location_ids().
    this_vector : optional
        Precomputed vector for `locationid`; computed on demand when omitted.

    Returns
    -------
    dict
        Mapping of location id -> distance for the (up to) k closest locations.
    """
    nearest = {}
    others = database.get_location_ids()
    others.remove(locationid)
    if not this_vector:
        this_vector = Vectorizer.visual_vector_multimodel(
            locationid, database, models)
    for other in others:
        # BUG FIX: vectorize the candidate `other`, not `locationid`. The
        # original compared the query vector to itself for every candidate,
        # making all distances identical.
        other_vector = Vectorizer.visual_vector_multimodel(
            other, database, models)
        distance = Distance.l_p_distance(3, this_vector, other_vector)
        if len(nearest) < k:
            largest_key, largest_best = None, inf
        else:
            largest_key, largest_best = max(nearest.items(), key=itemgetter(1))
        if distance < largest_best:
            # BUG FIX: explicit None check so a falsy key is still evicted.
            if largest_key is not None:
                nearest.pop(largest_key)
            nearest[other] = distance
    return nearest
def nearest_visual(self, *args):
    """Interactive command: print the k nearest locations by visual descriptor.

    arg[0] = locationid
    arg[1] = model (CM, CM3x3, CN, CN3x3, CSD, GLRLM, GLRLM3x3, HOG, LBP,
             LBP3x3, ALL)
    arg[2] = k
    """
    if not self.database:
        print("[ERROR] The database must be loaded for this action.")
        return
    if not len(args) == 3:
        print("[ERROR] Expected three arguments but got " +
              str(len(args)) + ".")
        print("\targ = " + str(args))
        # BUG FIX: abort on bad arity. The original fell through and then
        # indexed args[2], crashing with an IndexError.
        return
    # Parse k first so an obviously bad count fails fast.
    try:
        k = int(args[2])
    except (TypeError, ValueError):
        print("[ERROR] K Value provided is invalid.")
        print("\tk = " + str(args[2]))
        return
    # Validate the model before the id: it is easier to tell a valid model
    # from an invalid one than to validate an arbitrary id.
    model = args[1]
    if not model in self.valid_vis_models:
        print("[ERROR] Model Type value provided was invalid.")
        print("\tModel Type = " + str(args[1]))
        return
    try:
        locationid = int(args[0])
    except (TypeError, ValueError):
        print("[ERROR] The ID specified was not valid")
        # BUG FIX: `locationid` is unbound when int() fails; report the raw
        # argument instead (the original raised NameError here).
        print("\tID = " + str(args[0]) + "; Model = " + model)
        return
    start = time()
    if model == 'ALL':
        # Multi-model path: one averaged vector entry per model.
        this_vector = Vectorizer.visual_vector_multimodel(
            locationid, self.database, self.valid_vis_models)
        nearest = Neighbor.knn_visual_all(k, locationid,
                                          self.valid_vis_models,
                                          self.database,
                                          this_vector=this_vector)
    else:
        # Single-model path: vector for the item associated with the id.
        this_vector = Vectorizer.visual_vector(locationid, self.database,
                                               model)
        nearest = Neighbor.knn_visual(k, locationid, model, self.database,
                                      this_vector=this_vector)
    contribs = Neighbor.visual_sim_contribution(this_vector, nearest.keys(),
                                                self.database, model, k=3)
    print("The Visual Descriptor Nearest Neighbor took " +
          str(time() - start))
    print(str(k) + " Nearest Neighbors:")
    for i, (an_id, distance) in enumerate(nearest.items()):
        print('\t' + str(i) + ". " + str(an_id) + "; Distance = " +
              str(distance))
    print('Top 3 Image Pairs:')
    for i, item in enumerate(contribs):
        print('\t' + str(i) + '. ' + str(item))
class Graph:
    """A similarity graph whose nodes wrap documents or sentences.

    Attributes
    ----------
    nodes : List[Node]
        Nodes currently in the graph.
    num_of_nodes : int
        Total number of nodes.
    num_of_edges : List[int]
        Per-node edge counts (each count starts at 1 by convention).
    vectorizer : Vectorizer
        Produces vector representations for the node bodies.

    Methods
    -------
    add_node(node)
        Appends a node to the graph.
    create_graph()
        Connects every pair of nodes whose cosine similarity exceeds 0.10.
    """

    def __init__(self, term_dict: Dict[str, Term]):
        self.nodes = []
        self.num_of_nodes = 0
        self.num_of_edges = []
        self.vectorizer = Vectorizer(term_dict)

    def add_node(self, node: Node):
        self.num_of_nodes += 1
        self.nodes.append(node)

    def create_graph(self):
        bodies = [node.body for node in self.nodes]
        # Documents and sentences are vectorized through different entry
        # points; the first node's body decides which one applies.
        if type(bodies[0]) == Document:
            self.vectorizer.vectorize_docs(bodies)
        else:
            self.vectorizer.vectorize_sentences(bodies)
        # NOTE(review): counts deliberately start at 1, matching the
        # original implementation -- presumably to avoid zero degrees.
        self.num_of_edges = [1] * self.num_of_nodes
        for i, first in enumerate(self.nodes):
            for j in range(i + 1, self.num_of_nodes):
                second = self.nodes[j]
                weight = cos_sim(first.body.vector, second.body.vector)
                if weight > 0.10:
                    first.add_edge(Edge(second.id, weight))
                    second.add_edge(Edge(first.id, weight))
                    self.num_of_edges[i] += 1
                    self.num_of_edges[j] += 1
def knn_textual(k, an_id, model, item_type, database, this_vector=None):
    """Find the k nearest items of `item_type` to `an_id` by textual distance.

    Parameters
    ----------
    k : int
        Number of neighbors to keep.
    an_id
        Id of the query item (coerced to int for photos and pois).
    model : str
        Text model name, forwarded to Vectorizer.text_vector.
    item_type : str
        One of 'photo', 'user', 'poi'.
    database
        Data-access object providing the per-type id listings.
    this_vector : optional
        Precomputed vector for `an_id`; computed on demand when omitted.

    Returns
    -------
    dict
        Mapping of item id -> distance for the (up to) k closest items.

    Raises
    ------
    ValueError
        If `item_type` is not one of the supported types.
    """
    nearest = {}
    if not this_vector:
        this_vector = Vectorizer.text_vector(an_id, database, model,
                                             item_type)
    # Get the id's for all items of the same type.
    if item_type == 'photo':
        an_id = int(an_id)
        others = database.get_photo_ids()
    elif item_type == 'user':
        others = database.get_user_ids()
    elif item_type == 'poi':
        an_id = int(an_id)
        others = database.get_location_ids()
    else:
        raise ValueError(
            '[ERROR] The provided type was invalid.\ntype = ' +
            str(item_type))
    # The query item is not its own neighbor.
    others.remove(an_id)
    for other in others:
        other_vector = Vectorizer.text_vector(other, database, model,
                                              item_type)
        if len(other_vector) == 0:
            # Skip items with no textual descriptors at all.
            continue
        distance = Distance.l_p_distance(3, this_vector, other_vector)
        if len(nearest) < k:
            largest_key, largest_best = None, inf
        else:
            largest_key, largest_best = max(nearest.items(),
                                            key=itemgetter(1))
        if distance < largest_best:
            # BUG FIX: compare to None explicitly so a falsy key (id 0,
            # empty-string user id) is still evicted at capacity.
            if largest_key is not None:
                nearest.pop(largest_key)
            nearest[other] = distance
        # k exact matches (all distances zero) cannot be improved upon.
        if all([value == 0 for value in nearest.values()]):
            break
    return nearest
def nearest_text(self, *args):
    """Interactive command: print the k nearest items by textual descriptor.

    arg[0] = type (user, photo, poi)
    arg[1] = id
    arg[2] = model (tf, df, tf-idf)
    arg[3] = k
    """
    if not self.database:
        print("[ERROR] The database must be loaded for this action.")
        return
    # BUG FIX: use ==, not `is`, for integer comparison -- identity of small
    # ints is an implementation detail.
    if not len(args) == 4:
        print("Nearest Text expected 4 arguments but got " +
              str(len(args)) + ".")
        print("\targs = " + str(args))
        return
    # Parse k first so an obviously bad count fails fast.
    try:
        k = int(args[3])
    except (TypeError, ValueError):
        print("[ERROR] K Value provided is invalid.")
        # BUG FIX: report the k argument (args[3]), not args[0].
        print("\tk = " + str(args[3]))
        return
    # Get the type of item we are considering.
    itype = args[0]
    if not itype in self.valid_types:
        print("[ERROR] Item Type value provided was invalid.")
        # BUG FIX: report the type argument (args[0]), not args[1].
        print("\tItem Type = " + str(args[0]))
        return
    # Validate the model before the id: it is easier to tell a valid model
    # from an invalid one than to validate an arbitrary id.
    model = args[2]
    model = model.lower()
    if not model in self.valid_txt_models:
        print("[ERROR] Model Type value provided was invalid.")
        print("\tModel Type = " + str(args[2]))
        return
    try:
        # get vector representing item associated w/ id
        an_id = args[1]
        this_vector = Vectorizer.text_vector(an_id, self.database, model,
                                             itype)
    except Exception:
        print("[ERROR] The ID specified was not found in the dataset.")
        print("\tID = " + an_id + "; Type = " + itype + "; Model = " + model)
        return
    nearest = Neighbor.knn_textual(k, an_id, model, itype, self.database,
                                   this_vector=this_vector)
    contribs = Neighbor.similarity_by_id(this_vector, nearest.keys(),
                                         self.database, model, itype)
    print(str(k) + " Nearest Neighbors:")
    for i, (an_id, distance) in enumerate(nearest.items()):
        print('\t' + str(i) + ". " + str(an_id) + "; Distance = " +
              str(distance))
    print('Top 3 Features:')
    for i, item in enumerate(contribs):
        print('\t' + str(i) + '. ' + str(item))
def visual_sim_contribution(this_vector, ids, database, model, k=3):
    """Return the top-k similarity contributions between `this_vector` and
    the visual vectors of the locations in `ids`.

    Thin wrapper: vectorizes each id with the given model and delegates to
    Neighbor.similarity_contribution with positional distances.
    """
    vectors = []
    for locid in ids:
        vectors.append(Vectorizer.visual_vector(locid, database, model))
    return Neighbor.similarity_contribution(this_vector, vectors, k,
                                            positional=True)
class Summarizer():
    """Extractive text summarizer with three selection strategies:
    k-means clustering, LSA saliency, and TextRank (pagerank).
    """

    def __init__(self, w2v_path=r".\data\vi.vec"):
        # NOTE(review): "clearner" looks like a typo for "cleaner", but the
        # attribute name is used consistently below, so it is left as-is.
        self.clearner = Preprocessor()
        # Vectorizer is constructed from a word2vec file path.
        self.vectorizer = Vectorizer(w2v_path)

    def summarize(self, paragraph, mode="clustering", keep_sentences=5):
        """Summarize `paragraph` down to `keep_sentences` sentences.

        mode is one of "clustering" (k-means, pick the sentence closest to
        each centroid), "lsa" (SVD-based saliency), or anything else for the
        TextRank/pagerank fallback. Returns (summary_text, selected_indices).
        """
        # Original sentences are kept for the final output; the cleaned
        # versions are what get vectorized.
        origin_sentence = sent_tokenize(paragraph)
        sentences = self.clearner.preprocessing(paragraph)
        sent_vectors = self.vectorizer.vectorize(sentences)  # row vectors: one per sentence
        if mode == "clustering":
            kmeans = KMeans(n_clusters=keep_sentences)
            kmeans = kmeans.fit(sent_vectors)
            # Mean position of each cluster's sentences -- computed but no
            # longer used (superseded by the argmin-to-centroid selection).
            avg = []
            for j in range(keep_sentences):
                idx = np.where(kmeans.labels_ == j)[0]
                avg.append(np.mean(idx))
            # For each centroid, the index of the nearest sentence vector.
            closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_,
                                                       sent_vectors)
            # top_sentences = sorted(range(n_clusters), key=lambda k: avg[k])
            # Restore document order for the selected sentences.
            top_sentences = sorted(closest)
        elif mode == "lsa":
            # LSA path expects column vectors, hence the transpose.
            sent_vectors_t = sent_vectors.T
            U, S, VT = np.linalg.svd(sent_vectors_t)
            # Saliency of each sentence: singular values (squared) weighted
            # against the squared right-singular vectors.
            saliency_vec = np.dot(np.square(S), np.square(VT))
            # Top keep_sentences by saliency, then back to document order.
            top_sentences = saliency_vec.argsort()[-keep_sentences:][::-1]
            top_sentences.sort()
        else:
            # TextRank fallback: pairwise cosine similarity graph + pagerank.
            sim_mat = np.zeros([len(sentences), len(sentences)])
            for i in range(len(sentences)):
                for j in range(len(sentences)):
                    if i != j:
                        sim_mat[i][j] = cosine_similarity(
                            sent_vectors[i].reshape(1, -1),
                            sent_vectors[j].reshape(1, -1))[0][0]
            nx_graph = nx.from_numpy_array(sim_mat)
            scores = list(nx.pagerank(nx_graph).values())
            # Top keep_sentences by pagerank score, then document order.
            top_sentences = np.argsort(scores)[-keep_sentences:][::-1]
            top_sentences.sort()
        # Stitch the chosen original sentences together in document order.
        summary = " ".join([origin_sentence[i] for i in top_sentences])
        return summary, top_sentences
def main(): global serial, MIN_Y, MAX_Y, MIN_X, MAX_X if not len(sys.argv) == 3: print 'Wrong args' sys.exit(0) RES_X = int(sys.argv[1]) if RES_X < 0: data = Rasterizer() points = data.get_lines(sys.argv[2], MIN_X, MAX_X, MIN_Y, MAX_Y) else: data = Vectorizer() points = data.get_polygons(sys.argv[2], RES_X, MIN_X, MAX_X, MIN_Y, MAX_Y) serial = Serial(SERIAL_PORT, BAUD) serial.flushInput() serial.flushOutput() print 'Waiting for MCU' # Wait until the mcu sends a byte, signaling it's ready serial.read(1) print 'Starting transmission' count = 1 for cur_p in points: next_x = cur_p[0] next_y = cur_p[1] next_z = cur_p[2] data = struct.pack('<ffb', next_x, next_y, next_z) send_wait_ack(data) print 'Sent point %d of %d\n' % (count, len(points)) count += 1 # Send end of transmission send_wait_ack(END_DATA) raw_input("press enter to continue")
import os from vectorize import Vectorizer DATA_DIR = "../../data/movie/polarity_dataset_v1.0/tokens" def inpYielder(): root = DATA_DIR neg_dir = os.path.join(root, "neg") for fileName in os.listdir(neg_dir): print fileName f = open(os.path.join(neg_dir, fileName), "r") yield 0, f.read() f.close() pos_dir = os.path.join(root, "pos") for fileName in os.listdir(pos_dir): print fileName f = open(os.path.join(pos_dir, fileName), "r") yield 1, f.read() f.close() vectorizer = Vectorizer(\ "Bo Pang's movie data", \ "Movie", \ "polarity_dataset_v1.0", \ ngram = 1,\ ) vectorizer.run(inpYielder)
def similarity_by_id(this_vector, ids, database, model, itemtype, k=3):
    """Return the top-k similarity contributions between `this_vector` and
    the textual vectors of the items in `ids`.

    Thin wrapper: vectorizes each id with the given model/item type and
    delegates to Neighbor.similarity_contribution.
    """
    vectors = []
    for an_id in ids:
        vectors.append(Vectorizer.text_vector(an_id, database, model,
                                              itemtype))
    return Neighbor.similarity_contribution(this_vector, vectors, k)
import os.path
from csv import reader

from vectorize import Vectorizer

PATH = "../../data/movie/amazon"


def score_review_pairs():
    """Yield (score, review) pairs from the processed Amazon reviews TSV,
    with the score coerced to int."""
    # BUG FIX: the original opened the file without ever closing it. The
    # `with` block keeps it open exactly as long as the lazy csv iteration
    # needs, then closes it deterministically.
    with open(os.path.join(PATH, "processed_reviews.tsv"), "r") as fh:
        for score, review in reader(fh, delimiter="\t", quotechar="\""):
            yield int(score), review


vectorizer = Vectorizer(
    "Amazon dvd",
    "Movie",
    "amazon",
    ngram=3,
)
vectorizer.run(score_review_pairs)
def __init__(self, term_dict: Dict[str, Term]):
    """Initialize an empty graph and its vectorizer.

    Parameters
    ----------
    term_dict : Dict[str, Term]
        Term dictionary handed to the Vectorizer.
    """
    # The vectorizer is the only field that depends on the argument; the
    # remaining attributes start out empty.
    self.vectorizer = Vectorizer(term_dict)
    self.nodes = []
    self.num_of_edges = []
    self.num_of_nodes = 0
import os.path
from csv import reader

from vectorize import Vectorizer

PATH = "../../data/movie/amazon"


def score_review_pairs():
    """Yield (score, review) pairs from the processed Amazon reviews TSV,
    with the score coerced to int."""
    # BUG FIX: the original opened the file without ever closing it. The
    # `with` block keeps it open exactly as long as the lazy csv iteration
    # needs, then closes it deterministically.
    with open(os.path.join(PATH, "processed_reviews.tsv"), "r") as fh:
        for score, review in reader(fh, delimiter="\t", quotechar="\""):
            yield int(score), review


vectorizer = Vectorizer(
    "Amazon dvd",
    "Movie",
    "amazon",
    ngram=3,
)
vectorizer.run(score_review_pairs)
def __init__(self, w2v_path=r".\data\vi.vec"):
    """Build the preprocessing and vectorizing collaborators.

    Parameters
    ----------
    w2v_path : str
        Path to the word2vec file loaded by the Vectorizer.
    """
    # Construction order preserved: the preprocessor first, then the
    # vectorizer (which loads the w2v file).
    preprocessor = Preprocessor()
    vectorizer = Vectorizer(w2v_path)
    # NOTE(review): "clearner" looks like a typo for "cleaner", but other
    # code reads this attribute name, so it is kept as-is.
    self.clearner = preprocessor
    self.vectorizer = vectorizer