Example #1
0
    def knn_visual(k, locationid, model, database, this_vector=None):
        """Return the k nearest locations to `locationid` by visual distance.

        Parameters
        ----------
        k : int
            Number of neighbors to keep.
        locationid : int
            Location whose neighbors are sought; excluded from candidates.
        model : str
            Visual descriptor model name passed to the vectorizer.
        database : object
            Data source providing location ids and feature data.
        this_vector : sequence, optional
            Precomputed vector for `locationid`; computed when None.

        Returns
        -------
        dict
            Mapping of location id -> L3 distance for the k closest.
        """
        nearest = {}
        # BUG FIX: compare with None so a legitimately empty vector supplied
        # by the caller is not silently recomputed.
        if this_vector is None:
            this_vector = Vectorizer.visual_vector(locationid, database, model)

        others = database.get_location_ids()
        others.remove(locationid)

        # Keep a running set of the k smallest distances: whenever a closer
        # candidate appears, evict the current farthest entry.
        for other in others:
            other_vector = Vectorizer.visual_vector(other, database, model)
            distance = Distance.l_p_distance(3,
                                             this_vector,
                                             other_vector,
                                             positional=True)

            if len(nearest) < k:
                largest_key, largest_best = None, inf
            else:
                largest_key, largest_best = max(nearest.items(),
                                                key=itemgetter(1))

            if distance < largest_best:
                # BUG FIX: ids may be falsy (e.g. 0); test identity so a
                # valid key is still evicted.
                if largest_key is not None:
                    nearest.pop(largest_key)
                nearest[other] = distance

        return nearest
Example #2
0
    def knn_visual_all(k, locationid, models, database, this_vector=None):
        """Return the k nearest locations to `locationid` using the
        concatenated multi-model visual vector.

        Parameters mirror `knn_visual`, except `models` is the collection
        of visual model names combined into a single vector per location.

        Returns
        -------
        dict
            Mapping of location id -> L3 distance for the k closest.
        """
        nearest = {}
        others = database.get_location_ids()
        others.remove(locationid)

        # BUG FIX: compare with None so an empty caller-supplied vector is
        # honored rather than recomputed.
        if this_vector is None:
            this_vector = Vectorizer.visual_vector_multimodel(
                locationid, database, models)

        for other in others:
            # BUG FIX: vectorize the candidate (`other`), not `locationid`.
            # The original recomputed this location's own vector on every
            # iteration, so all distances were self-distances.
            other_vector = Vectorizer.visual_vector_multimodel(
                other, database, models)
            # get distance between vectors
            distance = Distance.l_p_distance(3, this_vector, other_vector)

            if len(nearest) < k:
                largest_key, largest_best = None, inf
            else:
                largest_key, largest_best = max(nearest.items(),
                                                key=itemgetter(1))

            if distance < largest_best:
                # BUG FIX: ids may be falsy (0); test identity.
                if largest_key is not None:
                    nearest.pop(largest_key)
                nearest[other] = distance

        return nearest
    def nearest_visual(self, *args):
        """Interactive command: print the k nearest locations by a visual
        descriptor model, plus the top contributing image pairs.

            arg[0] = locationid
            arg[1] = model (CM, CM3x3, CN, CN3x3, CSD, GLRLM, GLRLM3x3, HOG, LBP, LBP3x3, ALL)
            arg[2] = k
        """

        if not self.database:
            print("[ERROR] The database must be loaded for this action.")
            return

        if not len(args) == 3:
            print("[ERROR] Expected three arguments but got " + str(len(args)) + ".")
            print("\targ = " + str(args))
            # BUG FIX: the original fell through and crashed on args[2].
            return

        # Get the first argument
        try:
            k = int(args[2])
        except (ValueError, TypeError):
            print("[ERROR] K Value provided is invalid.")
            print("\tk = " + str(args[2]))
            return

        # Get the model to use. We do this before the item as it is easier
        #   to differentiate valid from invalid
        model = args[1]
        if model not in self.valid_vis_models:
            print("[ERROR] Model Type value provided was invalid.")
            print("\tModel Type = " + str(args[1]))
            return

        try:
            locationid = int(args[0])
        except (ValueError, TypeError):
            print("[ERROR] The ID specified was not valid")
            # BUG FIX: `locationid` is unbound when int() fails; report the
            # raw argument instead of raising NameError in the error path.
            print("\tID = " + str(args[0]) + "; Model = " + model)
            return

        start = time()
        if model == 'ALL':
            # Multi-model: combine features from every visual model.
            this_vector = Vectorizer.visual_vector_multimodel(locationid, self.database, self.valid_vis_models)
            nearest = Neighbor.knn_visual_all(k, locationid, self.valid_vis_models, self.database, this_vector=this_vector)
        else:
            # get vector representing item associated w/ id
            this_vector = Vectorizer.visual_vector(locationid, self.database, model)
            nearest = Neighbor.knn_visual(k, locationid, model, self.database, this_vector=this_vector)

        contribs = Neighbor.visual_sim_contribution(this_vector, nearest.keys(), self.database,
                                                    model, k=3)
        print("The Visual Descriptor Nearest Neighbor took " + str(time() - start))

        print(str(k) + " Nearest Neighbors:")
        for i, (an_id, distance) in enumerate(nearest.items()):
            print('\t' + str(i) + ". " + str(an_id) + "; Distance = " + str(distance))
        print('Top 3 Image Pairs:')
        for i, item in enumerate(contribs):
            print('\t' + str(i) + '. ' + str(item))
class Graph:
    """ The graph of nodes and edges 
    
    ...

    Attributes
    ----------
    nodes : List[Node]
        List of nodes in the graph
    num_of_nodes : int
        Total number of nodes
    num_of_edges : List[int]
        List of number of edges of nodes (each count starts at 1, not 0
        — presumably to avoid zero denominators downstream; TODO confirm)
    vectorizer : Vectorizer
        Vectorizer object that return vector representation of entities

    Methods
    -------
    add_node(node)
        Adds a node to graph
    create_graph()
        Creates all edges between nodes based on cosine-similarity
    """
    def __init__(self, term_dict: Dict[str, Term]):
        self.nodes = []
        self.num_of_nodes = 0
        self.num_of_edges = []
        self.vectorizer = Vectorizer(term_dict)

    def add_node(self, node: Node):
        """Append `node` and keep the node counter in sync."""
        self.nodes.append(node)
        self.num_of_nodes += 1

    def create_graph(self):
        """Vectorize every node body, then connect each pair of nodes whose
        cosine similarity exceeds 0.10 with a bidirectional edge."""

        # BUG FIX: type(...) == Document rejected Document subclasses;
        # isinstance is the correct check.
        if isinstance(self.nodes[0].body, Document):
            self.vectorizer.vectorize_docs([node.body for node in self.nodes])
        else:
            self.vectorizer.vectorize_sentences(
                [node.body for node in self.nodes])

        # Initialize every per-node edge count to 1 (preserved behavior).
        self.num_of_edges = [1] * self.num_of_nodes

        for i in range(self.num_of_nodes):
            for j in range(i + 1, self.num_of_nodes):
                node1 = self.nodes[i]
                node2 = self.nodes[j]
                similarity = cos_sim(node1.body.vector, node2.body.vector)
                if similarity > 0.10:
                    # Edges are stored on both endpoints (undirected graph).
                    node1.add_edge(Edge(node2.id, similarity))
                    node2.add_edge(Edge(node1.id, similarity))
                    self.num_of_edges[i] += 1
                    self.num_of_edges[j] += 1
Example #5
0
    def knn_textual(k, an_id, model, item_type, database, this_vector=None):
        """Return the k nearest items of `item_type` to `an_id` by textual
        descriptor distance (L3 norm).

        Parameters
        ----------
        k : int
            Number of neighbors to keep.
        an_id : int or str
            Item whose neighbors are sought (coerced to int for
            photo/poi); excluded from candidates.
        model : str
            Textual model name (e.g. tf, df, tf-idf).
        item_type : str
            One of 'photo', 'user', 'poi'.
        database : object
            Data source providing ids and textual descriptors.
        this_vector : sequence, optional
            Precomputed vector for `an_id`; computed when None.

        Returns
        -------
        dict
            Mapping of item id -> distance for the k closest.

        Raises
        ------
        ValueError
            If `item_type` is not one of the supported types.
        """
        nearest = {}
        # BUG FIX: compare with None so an empty caller-supplied vector is
        # not silently recomputed.
        if this_vector is None:
            this_vector = Vectorizer.text_vector(an_id, database, model,
                                                 item_type)

        # Get the id's for all items of the same type.
        if item_type == 'photo':
            an_id = int(an_id)
            others = database.get_photo_ids()
        elif item_type == 'user':
            others = database.get_user_ids()
        elif item_type == 'poi':
            an_id = int(an_id)
            others = database.get_location_ids()
        else:
            raise ValueError(
                '[ERROR] The provided type was invalid.\ntype = ' +
                str(item_type))

        # remove this id from the list
        others.remove(an_id)

        # for each other one, get their vector and calculate distance.
        for other in others:

            other_vector = Vectorizer.text_vector(other, database, model,
                                                  item_type)
            # Skip any elements with no textual descriptors at all.
            if len(other_vector) == 0:
                continue

            distance = Distance.l_p_distance(3, this_vector, other_vector)

            if len(nearest) < k:
                largest_key, largest_best = None, inf
            else:
                largest_key, largest_best = max(nearest.items(),
                                                key=itemgetter(1))

            if distance < largest_best:
                # remove the key with the largest distance if it exists.
                # BUG FIX: ids may be falsy (0); test identity.
                if largest_key is not None:
                    nearest.pop(largest_key)

                nearest[other] = distance

            # Early exit once every kept neighbor is a perfect match.
            # BUG FIX: all() over an empty dict is True, so the original
            # bailed out with an empty result whenever the first candidates
            # were skipped for having no descriptors.
            if nearest and all(value == 0 for value in nearest.values()):
                break

        # Return your K nearest
        return nearest
    def nearest_text(self, *args):
        """Interactive command: print the k nearest items by a textual
        descriptor model, plus the top contributing features.

            arg[0] = type (user, photo, poi)
            arg[1] = id
            arg[2] = model (tf, df, tf-idf)
            arg[3] = k
        """

        if not self.database:
            print("[ERROR] The database must be loaded for this action.")
            return

        # BUG FIX: `len(args) is 4` compared identity and only worked due to
        # CPython's small-int caching; use value equality.
        if len(args) != 4:
            print("Nearest Text expected 4 arguments but got " + str(len(args)) + ".")
            print("\targs = " + str(args))
            return

        # Get the first argument
        try:
            k = int(args[3])
        except (ValueError, TypeError):
            print("[ERROR] K Value provided is invalid.")
            # BUG FIX: report the argument actually parsed (args[3], not [0]).
            print("\tk = " + str(args[3]))
            return

        # Get the type of item we are considering.
        itype = args[0]
        if itype not in self.valid_types:
            print("[ERROR] Item Type value provided was invalid.")
            # BUG FIX: the type came from args[0], not args[1].
            print("\tItem Type = " + str(args[0]))
            return

        # Get the model to use. We do this before the item as it is easier
        #   to differentiate valid from invalid
        model = args[2].lower()
        if model not in self.valid_txt_models:
            print("[ERROR] Model Type value provided was invalid.")
            print("\tModel Type = " + str(args[2]))
            return

        try:
            # get vector representing item associated w/ id
            an_id = args[1]
            this_vector = Vectorizer.text_vector(an_id, self.database, model, itype)
        except Exception:
            print("[ERROR] The ID specified was not found in the dataset.")
            print("\tID = " + an_id + "; Type = " + itype + "; Model = " + model)
            return

        nearest = Neighbor.knn_textual(k, an_id, model, itype, self.database, this_vector=this_vector)
        contribs = Neighbor.similarity_by_id(this_vector, nearest.keys(),
                                             self.database, model, itype)

        print(str(k) + " Nearest Neighbors:")
        for i, (an_id, distance) in enumerate(nearest.items()):
            print('\t' + str(i) + ". " + str(an_id) + "; Distance = " + str(distance))
        print('Top 3 Features:')
        for i, item in enumerate(contribs):
            print('\t' + str(i) + '. ' + str(item))
Example #7
0
 def visual_sim_contribution(this_vector, ids, database, model, k=3):
     """Top-k similarity contributions between `this_vector` and the
     visual vectors of the given location ids."""
     vectors = []
     for locid in ids:
         vectors.append(Vectorizer.visual_vector(locid, database, model))
     return Neighbor.similarity_contribution(
         this_vector, vectors, k, positional=True)
Example #8
0
class Summarizer():
    """Extractive summarizer supporting k-means clustering, LSA, and
    TextRank (PageRank over a sentence-similarity graph)."""

    def __init__(self, w2v_path=r".\data\vi.vec"):
        # NOTE(review): attribute name 'clearner' (sic) is kept for
        # backward compatibility with any external callers.
        self.clearner = Preprocessor()
        self.vectorizer = Vectorizer(w2v_path)

    def summarize(self, paragraph, mode="clustering", keep_sentences=5):
        """Return (summary, top_sentences).

        Selects `keep_sentences` sentences from `paragraph` using `mode`
        ('clustering', 'lsa', or TextRank for any other value) and joins
        them in original document order.
        """
        origin_sentence = sent_tokenize(paragraph)
        sentences = self.clearner.preprocessing(paragraph)
        sent_vectors = self.vectorizer.vectorize(sentences)  # row vector

        if mode == "clustering":
            # Pick the sentence closest to each k-means cluster centroid.
            # (Removed a dead loop that built an unused `avg` list.)
            kmeans = KMeans(n_clusters=keep_sentences)
            kmeans = kmeans.fit(sent_vectors)
            closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_,
                                                       sent_vectors)
            top_sentences = sorted(closest)

        elif mode == "lsa":
            # input: column vector; saliency combines squared singular
            # values with squared right-singular vectors.
            sent_vectors_t = sent_vectors.T
            U, S, VT = np.linalg.svd(sent_vectors_t)
            saliency_vec = np.dot(np.square(S), np.square(VT))
            top_sentences = saliency_vec.argsort()[-keep_sentences:][::-1]
            top_sentences.sort()

        else:
            # TextRank: pairwise cosine-similarity graph + PageRank scores.
            sim_mat = np.zeros([len(sentences), len(sentences)])
            for i in range(len(sentences)):
                for j in range(len(sentences)):
                    if i != j:
                        sim_mat[i][j] = cosine_similarity(
                            sent_vectors[i].reshape(1, -1),
                            sent_vectors[j].reshape(1, -1))[0][0]

            nx_graph = nx.from_numpy_array(sim_mat)
            scores = list(nx.pagerank(nx_graph).values())
            top_sentences = np.argsort(scores)[-keep_sentences:][::-1]
            top_sentences.sort()

        summary = " ".join([origin_sentence[i] for i in top_sentences])
        return summary, top_sentences
Example #9
0
def main():
    global serial, MIN_Y, MAX_Y, MIN_X, MAX_X

    if not len(sys.argv) == 3:
        print 'Wrong args'
        sys.exit(0)

    RES_X = int(sys.argv[1])

    if RES_X < 0:
        data = Rasterizer()
        points = data.get_lines(sys.argv[2], MIN_X, MAX_X, MIN_Y, MAX_Y)
    else:
        data = Vectorizer()
        points = data.get_polygons(sys.argv[2], RES_X, MIN_X, MAX_X, MIN_Y,
                                   MAX_Y)

    serial = Serial(SERIAL_PORT, BAUD)
    serial.flushInput()
    serial.flushOutput()

    print 'Waiting for MCU'

    # Wait until the mcu sends a byte, signaling it's ready
    serial.read(1)

    print 'Starting transmission'

    count = 1
    for cur_p in points:
        next_x = cur_p[0]
        next_y = cur_p[1]
        next_z = cur_p[2]
        data = struct.pack('<ffb', next_x, next_y, next_z)
        send_wait_ack(data)

        print 'Sent point %d of %d\n' % (count, len(points))
        count += 1

    # Send end of transmission
    send_wait_ack(END_DATA)

    raw_input("press enter to continue")
import os
from vectorize import Vectorizer

DATA_DIR = "../../data/movie/polarity_dataset_v1.0/tokens"

def inpYielder():
    root = DATA_DIR
    neg_dir = os.path.join(root, "neg")
    for fileName in os.listdir(neg_dir):
        print fileName
        f = open(os.path.join(neg_dir, fileName), "r")
        yield 0, f.read()
        f.close()
        
    pos_dir = os.path.join(root, "pos")
    for fileName in os.listdir(pos_dir):
        print fileName
        f = open(os.path.join(pos_dir, fileName), "r")
        yield 1, f.read()
        f.close()
    
# Vectorize Bo Pang's polarity movie-review corpus with unigram features.
# (Backslash continuations are unnecessary inside parentheses.)
vectorizer = Vectorizer(
    "Bo Pang's movie data",
    "Movie",
    "polarity_dataset_v1.0",
    ngram=1,
)

vectorizer.run(inpYielder)
Example #11
0
 def similarity_by_id(this_vector, ids, database, model, itemtype, k=3):
     """Top-k similarity contributions between `this_vector` and the
     textual vectors of the given ids."""
     vectors = []
     for an_id in ids:
         vectors.append(
             Vectorizer.text_vector(an_id, database, model, itemtype))
     return Neighbor.similarity_contribution(this_vector, vectors, k)
import os.path
from csv import reader
from vectorize import Vectorizer

PATH = "../../data/movie/amazon"


def score_review_pairs():
    """Yield (score, review) pairs from the processed Amazon reviews TSV.

    BUG FIX: the original opened the file inline and never closed it;
    the context manager guarantees the handle is released when the
    generator is exhausted or closed.
    """
    path = os.path.join(PATH, "processed_reviews.tsv")
    with open(path, "r") as handle:
        fi = reader(handle, delimiter="\t", quotechar="\"")
        for score, review in fi:
            yield int(score), review


# Configure the trigram vectorizer for the Amazon DVD review corpus and
# run it over the (score, review) stream.
vectorizer = Vectorizer("Amazon dvd", "Movie", "amazon", ngram=3)

vectorizer.run(score_review_pairs)
 def __init__(self, term_dict: Dict[str, Term]):
     """Create an empty graph; `term_dict` seeds the vectorizer used to
     embed node bodies."""
     self.nodes = []  # nodes added so far
     self.num_of_nodes = 0  # kept in sync with len(self.nodes)
     self.num_of_edges = []  # per-node edge counts, filled later
     self.vectorizer = Vectorizer(term_dict)
import os.path
from csv import reader
from vectorize import Vectorizer

PATH = "../../data/movie/amazon"

def score_review_pairs():
    """Yield (score, review) pairs from the processed Amazon reviews TSV.

    BUG FIX: close the file via a context manager instead of leaking the
    handle opened inline.
    """
    tsv_path = os.path.join(PATH, "processed_reviews.tsv")
    with open(tsv_path, "r") as handle:
        fi = reader(
            handle,
            delimiter="\t",
            quotechar="\""
        )
        for score, review in fi:
            yield int(score), review
        
# Build the trigram vectorizer for the Amazon DVD corpus, then feed it
# the (score, review) pairs.
vectorizer = Vectorizer("Amazon dvd", "Movie", "amazon", ngram=3)

vectorizer.run(score_review_pairs)
import os
from vectorize import Vectorizer

DATA_DIR = "../../data/movie/polarity_dataset_v1.0/tokens"


def inpYielder():
    root = DATA_DIR
    neg_dir = os.path.join(root, "neg")
    for fileName in os.listdir(neg_dir):
        print fileName
        f = open(os.path.join(neg_dir, fileName), "r")
        yield 0, f.read()
        f.close()

    pos_dir = os.path.join(root, "pos")
    for fileName in os.listdir(pos_dir):
        print fileName
        f = open(os.path.join(pos_dir, fileName), "r")
        yield 1, f.read()
        f.close()

# Unigram vectorizer over Bo Pang's polarity movie-review data set.
# (No backslash continuations needed inside the call's parentheses.)
vectorizer = Vectorizer(
    "Bo Pang's movie data",
    "Movie",
    "polarity_dataset_v1.0",
    ngram=1,
)

vectorizer.run(inpYielder)
Example #16
0
 def __init__(self, w2v_path=r".\data\vi.vec"):
     """Load the text preprocessor and a word-vector-backed vectorizer.

     w2v_path: path to the word-vector file (Windows-style default).
     """
     self.clearner = Preprocessor()  # (sic) — attribute name kept as-is
     self.vectorizer = Vectorizer(w2v_path)