Example #1
def load_test_data(filename, subreddits):
    instances = []

    # each subreddit's index in the FeatureVector is its position in subreddits[]
    # iterate through file again
    with open(filename) as reader:
        for line in reader:
            if len(line.strip()) == 0:
                continue

            split_line = line.split(",")
            label = ClassificationLabel(split_line[0])
            split_line.pop(0)

            feature_vector = FeatureVector()

            for subreddit in split_line:
                # strip the trailing newline that sometimes remains
                subreddit = subreddit.replace("\n", "")
                # only count subreddits that appear in the known vocabulary
                if subreddit in subreddits:
                    feature = subreddits.index(subreddit)
                    feature_vector.add(feature, 1)

            instance = Instance(feature_vector, label)
            instances.append(instance)

    return instances
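None of the snippets on this page show the container classes they all rely on (ClassificationLabel, FeatureVector, Instance). The following is a minimal, dict-backed sketch that is merely consistent with how these examples call them; the internal attribute names and the optional size argument are assumptions, not the actual course implementation.

class ClassificationLabel:
    """Wraps a class label (an int in most examples, a raw string in Example #1)."""
    def __init__(self, label):
        self._label = label

    def __str__(self):
        return str(self._label)


class FeatureVector:
    """Sparse feature vector backed by a dict mapping feature index -> value."""
    def __init__(self, size=None):
        # `size` is accepted because Example #3 passes a width; this sketch
        # only stores it and does not preallocate anything.
        self._size = size
        self._features = {}
        self._max_index = 0

    def add(self, index, value):
        self._features[index] = value
        self._max_index = max(self._max_index, index)

    def get(self, index):
        return self._features.get(index, 0)

    def get_keys(self):
        return self._features.keys()

    @property
    def feature_vector(self):
        # Some predictors below read the raw mapping directly.
        return self._features


class Instance:
    """Pairs a feature vector with its classification label."""
    def __init__(self, feature_vector, label):
        self._feature_vector = feature_vector
        self._label = label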
    def predict(self, instance):
        """Argmax over candidate labels k of the dot-product score."""
        max_k, max_val = -1, -float('inf')
        for k in range(1, max(self.all_labels) + 1):
            val = self.compute_dot_product(
                instance._feature_vector.feature_vector, k)
            if val > max_val:
                max_k, max_val = k, val
            elif val == max_val and k < max_k:
                max_k = k
        return ClassificationLabel(max_k)
Example #3
def load_data(filename):
    instances = []
    global indexmax
    indexmax = 1

    # First pass: find the largest feature index so the vector width is known.
    with open(filename) as reader:
        for line in reader:
            split_line = line.split(" ")
            for item in split_line[1:]:
                index = int(item.split(":")[0])
                if index > indexmax:
                    indexmax = index
        indexmax = indexmax + 1

    # pastindexmax is assumed to be a module-level global initialized to -1;
    # once it has been set (e.g. from the training data), reuse that width so
    # test vectors match the training feature space.
    if pastindexmax != -1:
        indexmax = pastindexmax

    # Second pass: build a labeled Instance from each non-empty line.
    with open(filename) as reader:
        for line in reader:
            if len(line.strip()) == 0:
                continue

            split_line = line.split(" ")
            label_string = split_line[0]

            int_label = -1
            try:
                int_label = int(label_string)
            except ValueError:
                raise ValueError("Unable to convert " + label_string +
                                 " to integer.")

            label = ClassificationLabel(int_label)

            feature_vector = FeatureVector(indexmax)

            for item in split_line[1:]:
                try:
                    index = int(item.split(":")[0])
                except ValueError:
                    raise ValueError("Unable to convert index " +
                                     item.split(":")[0] + " to integer.")
                try:
                    value = float(item.split(":")[1])
                except ValueError:
                    raise ValueError("Unable to convert value " +
                                     item.split(":")[1] + " to float.")

                if value != 0.0:
                    if index <= indexmax:
                        feature_vector.add(index, value)

            instance = Instance(feature_vector, label)
            instances.append(instance)
    return instances
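The input here is the whitespace-separated "label index:value ..." (SVMlight-style) format. A hypothetical call sequence, assuming pastindexmax is the module-level global the code reads and the file names are placeholders:

pastindexmax = -1                               # -1 means "no feature width fixed yet"
train_instances = load_data("data/train.txt")   # sets the module-level indexmax
pastindexmax = indexmax                         # freeze the training width...
test_instances = load_data("data/test.txt")     # ...so test vectors get the same width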
def load_data(filename):
    """Function for loading the features from a file into instances"""
    instances = []
    with open(filename) as reader:
        global max_size
        max_size = 0
        # Track the largest feature index seen; exposed as a module-level global.
        global max_max_index
        max_max_index = 0
        for line in reader:
            if len(line.strip()) == 0:
                continue

            # Divide the line into features and label.
            split_line = line.split(" ")
            label_string = split_line[0]

            int_label = -1
            try:
                int_label = int(label_string)
            except ValueError:
                raise ValueError("Unable to convert " + label_string +
                                 " to integer.")

            label = ClassificationLabel(int_label)
            feature_vector = FeatureVector()

            for item in split_line[1:]:
                try:
                    index = int(item.split(":")[0])

                except ValueError:
                    raise ValueError("Unable to convert index " +
                                     item.split(":")[0] + " to integer.")
                try:
                    value = float(item.split(":")[1])
                except ValueError:
                    raise ValueError("Unable to convert value " +
                                     item.split(":")[1] + " to float.")

                if value != 0.0:
                    feature_vector.add(index, value)

            instance = Instance(feature_vector, label)
            instances.append(instance)
            # Remember the widest feature vector seen so far.
            if feature_vector._max_index > max_max_index:
                max_max_index = feature_vector._max_index

    return instances
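Because max_max_index is set as a module-level global inside the call, the width of the feature space can be read off afterwards. A hypothetical usage with a placeholder file name:

train_instances = load_data("data/train.txt")   # hypothetical path
num_features = max_max_index + 1                # global set during the call above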
Example #5
    def load_instances(self):
        """Rebuild instances from the word-frequency file written by create_instances."""
        filename = "output/word_frequencies.txt"
        with open(filename) as reader:
            for line in reader:
                split_line = line.split(" ")
                label = ClassificationLabel(split_line[0])
                split_line.pop(0)
                fv = FeatureVector()
                for word in split_line:
                    # skip the bare newline token at the end of each line
                    if word != "\n":
                        num = word.split(":")
                        fv.add(int(num[0]), int(num[1]))
                instance = Instance(fv, label)
                self.instances.append(instance)
        return self.instances
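This reads back the "label index:count ..." lines that create_instances (Example #8) writes out. A hypothetical usage; the file contents shown in the comments and the `loader` object are illustrative only:

# Hypothetical contents of output/word_frequencies.txt (label, then index:count pairs):
#   0 12:3 47:1 103:2
#   1 5:1 12:1 98:4
instances = loader.load_instances()   # `loader` stands in for whatever object owns the method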
Example #6
def load_more_data(filename):
    subreddits = []
    instances = []

    # figure out what all the possible Subreddits are
    with open(filename) as reader:
        for line in reader:
            if len(line.strip()) == 0:
                continue

            split_line = line.split(",")
            split_line.pop(0)
            for subreddit in split_line:
                # sometimes there is an extraneous "\n"
                if "\n" in subreddit:
                    subreddit = subreddit.replace("\n", "")
                if subreddit not in subreddits:
                    subreddits.append(subreddit)

    # each subreddit's index in the FeatureVector is its position in subreddits[]
    # iterate through file again
    counter = 0
    with open(filename) as reader:
        for line in reader:
            if len(line.strip()) == 0:
                continue

            split_line = line.split(",")
            label = ClassificationLabel(split_line[0])
            split_line.pop(0)

            feature_vector = FeatureVector()

            for subreddit in split_line:
                # sometimes there is an extraneous "\n"
                if "\n" in subreddit:
                    subreddit = subreddit.replace("\n", "")
                feature = subreddits.index(subreddit)
                feature_vector.add(feature, 1)

            instance = Instance(feature_vector, label)
            instances.append(instance)

            counter += 1
            # if counter % 100 == 0:
            #     print(counter)

    return (instances, subreddits)
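Returning the subreddit vocabulary makes this the natural counterpart of load_test_data in Example #1, which re-uses the same list so that train and test share feature indices. The file names below are hypothetical:

train_instances, subreddits = load_more_data("data/train.csv")   # hypothetical paths
test_instances = load_test_data("data/test.csv", subreddits)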
def load_data(filename):
    instances = []
    # Track the largest feature index seen across the whole file.
    highest_idx = 0
    with open(filename) as reader:
        for line in reader:
            if len(line.strip()) == 0:
                continue

            # Divide the line into features and label.
            split_line = line.split(" ")
            label_string = split_line[0]

            int_label = -1
            try:
                int_label = int(label_string)
            except ValueError:
                raise ValueError("Unable to convert " + label_string +
                                 " to integer.")

            label = ClassificationLabel(int_label)
            feature_vector = FeatureVector()

            for item in split_line[1:]:
                try:
                    index = int(item.split(":")[0])
                    # keep a running maximum of the feature indices
                    if index > highest_idx:
                        highest_idx = index
                except ValueError:
                    raise ValueError("Unable to convert index " +
                                     item.split(":")[0] + " to integer.")
                try:
                    value = float(item.split(":")[1])
                except ValueError:
                    raise ValueError("Unable to convert value " +
                                     item.split(":")[1] + " to float.")

                if value != 0.0:
                    feature_vector.add(index, value)

            instance = Instance(feature_vector, label)
            instances.append(instance)
    # Return the largest index as well, so callers can size weight vectors.
    return instances, highest_idx
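A hypothetical usage of the extra return value; the path is a placeholder, and sizing a plain list of weights is just one possible way to use the index:

train_instances, highest_idx = load_data("data/train.txt")   # hypothetical path
weights = [0.0] * (highest_idx + 1)   # one weight per feature index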
Example #8
    def create_instances(self):
        counter = 0.0
        length = len(self.descriptions)
        filename = "output/word_frequencies.txt"
        fo = open(filename, "w", encoding="utf8")
        for d in self.descriptions:
            stripped = self.clean_text(d['description'])
            label = ClassificationLabel(int(counter))
            fv = FeatureVector()
            for word in stripped:
                # a word's feature index is its position in the corpus
                feature = self.corpus.index(word)
                fv.add(feature, int(fv.get(feature) + 1))
            instance = Instance(fv, label)
            self.instances.append(instance)
            # serialize as "label index:count index:count ..."
            line = str(label) + " "
            for f in fv.get_keys():
                line += str(f) + ":" + str(fv.get(f)) + " "
            fo.write(line + "\n")
            counter += 1
            self.update_progress(float(counter / length))
        fo.close()
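Each description becomes one "label index:count ..." line, which is exactly the format that load_instances in Example #5 parses back. A hypothetical round trip on the owning object:

builder.create_instances()            # `builder` is a stand-in for the owning object
instances = builder.load_instances()  # reads output/word_frequencies.txt back (Example #5)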
def load_data(filename):
    """Function for loading the features from a file into instances"""
    instances = []
    with open(filename) as reader:
        for line in reader:
            if len(line.strip()) == 0:
                continue
            
            # Divide the line into features and label.
            split_line = line.split(" ")
            label_string = split_line[0]

            int_label = -1
            try:
                int_label = int(label_string)
            except ValueError:
                raise ValueError("Unable to convert " + label_string + " to integer.")

            label = ClassificationLabel(int_label)
            feature_vector = FeatureVector()

            for item in split_line[1:]:
                try:
                    index = int(item.split(":")[0])

                except ValueError:
                    raise ValueError("Unable to convert index " + item.split(":")[0] + " to integer.")
                try:
                    value = float(item.split(":")[1])
                except ValueError:
                    raise ValueError("Unable to convert value " + item.split(":")[1] + " to float.")
                
                if value != 0.0:
                    feature_vector.add(index, value)

            # print("num non zero = %d", feature_vector.get_lil_matrix().count_nonzero())

            instance = Instance(feature_vector, label)
            instances.append(instance)

    return instances
    def predict(self, instance):
        # self.sign is assumed to map the feature vector to a class value
        # (e.g. the sign of w . x).
        return ClassificationLabel(
            self.sign(instance._feature_vector.feature_vector))
    def predict(self, instance):
        # Non-negative dot product -> class 1, otherwise class 0.
        return ClassificationLabel(1) if self.compute_dot_product(
            instance) >= 0 else ClassificationLabel(0)
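A minimal sketch of an evaluation loop tying the loaders and predictors together; the trained `model` object, the file path, and the `_label` attribute on Instance are assumptions rather than anything shown above:

test_instances = load_data("data/test.txt")            # hypothetical path
correct = 0
for inst in test_instances:
    predicted = model.predict(inst)                    # `model` is an already-trained classifier
    if str(predicted) == str(inst._label):             # compare labels via their string form
        correct += 1
print("accuracy: %.3f" % (correct / len(test_instances)))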