Example #1
    def predict_svm(self, example):
        '''
        :param example: str (example comment)
        :return: str (constructiveness prediction for the example)

        Description:
        Given a comment string `example`, this method returns whether the comment
        is constructive or not, based on the trained constructiveness model.
        '''

        # Build a feature vector for the example
        example_df = pd.DataFrame.from_dict({
            'pp_comment_text': [example],
            'constructive': ['?']
        })
        print(example_df)
        fe = FeatureExtractor(example_df)
        fe.extract_features()
        feats_df = fe.get_features_df()

        # Get the prediction score and find the winner
        prediction = self.svm_pipeline.predict(feats_df)[0]
        prediction_winner = 'Non-constructive' if prediction == 0 else 'Constructive'

        return prediction_winner.upper()
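A minimal usage sketch for this method, assuming the enclosing classifier class is named ConstructivenessClassifier and already holds a trained svm_pipeline (both names are hypothetical, not taken from the snippet):

# Hypothetical class name and constructor; only predict_svm is shown above.
clf = ConstructivenessClassifier()
print(clf.predict_svm("Thanks for the detailed explanation, this helped a lot."))
# Expected output: 'CONSTRUCTIVE' or 'NON-CONSTRUCTIVE'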
Example #2
def main():

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    if EXTRACT_FEATURES:
        extractor = FeatureExtractor(DATA_ROOT_DIR, SEQUENCE_SIZE)
        extractor.extract_features()

    # training dataset
    training_dataset = TypeNetDataset(DATA_ROOT_DIR, ENROLLMENT_SEQUENCES,
                                      PARTS_TRAINING)

    # validation dataset
    validation_dataset = TypeNetDataset(DATA_ROOT_DIR, ENROLLMENT_SEQUENCES,
                                        PARTS_VAL)
    val_loader = torch.utils.data.DataLoader(validation_dataset,
                                             batch_size=batch_size,
                                             shuffle=True)

    # test dataset
    # test_dataset = TypeNetDatasetTest(DATA_ROOT_DIR, PARTS_TEST)
    # test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False)

    # initializing model
    model = SiameseNet(input_size, hidden_size, num_layers).to(device)

    # adam optimizer
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # training
    for epoch in range(num_epochs):
        if save_states:
            torch.save(model.state_dict(),
                       'states/siamese_typenet_{:03}.pt'.format(epoch))

        if epoch % 5 == 0:
            validation(model, device, val_loader, epoch)

        indices = sample(range(0, len(training_dataset)),
                         batch_size * BATCHS_PER_ITER)
        training_subset = torch.utils.data.Subset(training_dataset, indices)
        train_loader = torch.utils.data.DataLoader(training_subset,
                                                   batch_size=batch_size,
                                                   shuffle=True)
        train(model, device, train_loader, epoch, optimizer)

    if save_states:
        torch.save(model.state_dict(), 'states/siamese_typenet_final.pt')
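This main() relies on several module-level names that are not part of the snippet; an illustrative (not authoritative) set of definitions could look like the following, with every value an assumption:

# Illustrative configuration only; the real values are not shown in this example.
from random import sample  # used by the training loop above

EXTRACT_FEATURES = False
DATA_ROOT_DIR = 'data/'
SEQUENCE_SIZE = 50
ENROLLMENT_SEQUENCES = 15
PARTS_TRAINING, PARTS_VAL, PARTS_TEST = [0, 1], [2], [3]
BATCHS_PER_ITER = 100
batch_size = 512
lr = 1e-3
num_epochs = 200
input_size, hidden_size, num_layers = 5, 128, 2
save_states = True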
Example #3
def extract_features(json_file):
    class_id = json_file.split('_')[-1]
    class_id = int(class_id[0])
    feature_extractor = FeatureExtractor(json_file)
    feature_list = feature_extractor.extract_features()

    return feature_list, class_id
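A small usage sketch, assuming the JSON files are named so that the segment after the last underscore starts with the class digit (e.g. sample_3.json); the directory is a placeholder:

import glob

features, labels = [], []
for path in sorted(glob.glob('data/*.json')):  # placeholder directory
    feats, cid = extract_features(path)
    features.append(feats)
    labels.append(cid)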
Example #4
class TestFeatureExtractor(unittest.TestCase):
    '''
    Unit tests for the FeatureExtractor class. Runs simple checks to ensure that
    the feature vector we get back has the right length and contains frequency
    data that makes sense. More tests should be added.
    ''' 
    def setUp(self):
        '''Sets up the test by constructing feature vectors to get tested'''       
        self.record1 = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
            IUPAC.protein),
            id="YP_025292.1", name="HokC",
            description="toxic membrane protein, small")        
        self.seq1 = self.record1.seq
        self.feature_extractor = FeatureExtractor()  
        self.feature_vector1 = self.feature_extractor.extract_features(self.seq1)
        
    def test_feature_vector_length(self):
        '''Tests that the feature vector is 400 elements long'''
        self.assertEqual(len(self.feature_vector1), 400, msg="Feature vector not 400 long")
        
    def test_dipeptide_frequency_sum(self):
        '''Tests that the dipeptide frequencies sum to 1'''
        checksum = 0.0
        for i in range(0,400):
            checksum += self.feature_vector1[i]
        self.assertAlmostEqual(checksum, 1.0, places=5, msg="Frequencies don't sum to 1")
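To run these tests directly, the standard unittest entry point can be added at the bottom of the module:

if __name__ == '__main__':
    unittest.main()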
Example #5
def preprocessing(params):
    # filter the logs
    filter_logs = Filter(params)
    filter_settings = filter_logs.get_filter_settings()
    filter_logs.filter_logs()
    # If the template file is not available, generate the template.
    # Filter settings are needed to get the SVN branch path for a particular version.
    # Currently accesses the SVN server directly; TODO: keep a local backup copy in case the SVN server is down.
    templatizer = Templatizer(filter_settings=filter_settings)
    component_template = templatizer.gen_template()
    # extract features from log, using template
    extractor = FeatureExtractor(component_template=component_template,
                                 techdump_filename=params['filename'],
                                 filter_settings=filter_settings)
    return component_template, extractor.extract_features()
Example #6
    def extract_results(self):
        overall_extracted_log_features = []
        overall_test_case_labels = []
        component_template = {}
        features_template = {}
        base_path = os.path.basename(self.start_path).split("-")
        if len(base_path) == 2:
            start_index = int(base_path[1])
            end_index = int(os.path.basename(self.end_path).split("-")[1])
            base_path = os.path.join(os.path.dirname(self.start_path),
                                     base_path[0]) + "-"
            for index in range(start_index, end_index + 1):
                test_result = base_path + str(index)
                acp_version, test_case_labels = self.parse_results(test_result)
                overall_test_case_labels.append(test_case_labels)
                extracted_log_features = {}
                for test in test_case_labels:
                    test_path = os.path.join(test_result, test)
                    if os.path.isdir(test_path):
                        test_result_dir = os.path.join(
                            self.mode, os.path.basename(test_result))
                        techdump_rel_path = os.path.join(test_result_dir, test)
                        params = {
                            "channel_number": self.channel_number,
                            "src_txt_result_dir_path": test_path,
                            "techdump_name": test,
                            "test_result_dir": test_result_dir
                        }
                        log_filter = Filter(params, filter_type="txt_result")
                        filter_settings = log_filter.get_filter_settings()
                        log_filter.filter_logs()
                        templatizer = Templatizer(acp_version=acp_version)
                        component_template, features_template = templatizer.gen_template()
                        # extract features from log, using template
                        extractor = FeatureExtractor(
                            component_template=component_template,
                            techdump_filename=techdump_rel_path,
                            filter_settings=filter_settings)
                        extracted_log_features.update(
                            {test: extractor.extract_features()})
                overall_extracted_log_features.append(extracted_log_features)

        else:
            logger.error(
                "Invalid start path provided. Needs to contain '-' in last directory %s"
                % self.start_path)
        return component_template, features_template, overall_extracted_log_features, overall_test_case_labels
Example #7
def train_dataset_parser(train_path):
    fe = FeatureExtractor()
    bag_of_words = set()
    images_labels = list()

    # Extract labels and create bag of words
    for image_name in sorted(os.listdir(train_path)):
        token_list = [int(l) for l in re.findall(r"[\d']+", image_name)]

        images_labels.append((image_name, token_list[1:]))
        bag_of_words = bag_of_words.union(set(token_list[1:]))

    # Freeze the bag-of-words ordering so each label maps to a fixed index,
    # then build binary label vectors and write the features to a CSV file.
    bag_of_words = list(bag_of_words)
    binary_labels = list()
    data = list()
    with open('../dataset/features_train.csv', 'w') as f:
        # Write the bag of words to the file
        f.write('{0}\n'.format(','.join([str(x) for x in bag_of_words])))

        for image_name, labels in images_labels:
            binary_vector = [0] * len(bag_of_words)

            for label in labels:
                binary_vector[bag_of_words.index(label)] = 1

            binary_labels.append(binary_vector)

            # Extract features
            spatial_features = fe.extract_features(os.path.join(train_path, image_name))
            data.append(spatial_features)

            # Write the extracted features to the file
            f.write('{0},{1}\n'.format(
                ','.join([str(x) for x in spatial_features]),
                ','.join([str(x) for x in binary_vector])
            ))

    return data, binary_labels, bag_of_words
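A hedged sketch of how the returned values could feed a multi-label classifier; the dataset path and the scikit-learn model choice are assumptions, not part of the original example:

import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

data, binary_labels, bag_of_words = train_dataset_parser('../dataset/train')  # placeholder path
clf = OneVsRestClassifier(LinearSVC())
clf.fit(np.array(data), np.array(binary_labels))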
Example #8
def test_dataset_parser(test_path):
    fe = FeatureExtractor()

    data = list()
    image_ids = list()
    with open('../dataset/features_test.csv', 'w') as f:
        for image_name in os.listdir(test_path):
            # Extract image ID
            image_id = image_name.split('.')[0]
            image_ids.append(image_id)

            # Extract features
            spatial_features = fe.extract_features(os.path.join(test_path, image_name))
            data.append(spatial_features)

            # Write the extracted features to the file
            f.write('{0},{1}\n'.format(
                image_id,
                ','.join([str(x) for x in spatial_features])
            ))

    return data, image_ids
Example #9
class OdometryEstimator:
    DISTANCE_SQ_THRESHOLD = 1
    SCAN_VICINITY = 2.5

    def __init__(self):
        self.extractor = FeatureExtractor()

        self.inited = False
        self.last_less_sharp_points = None
        self.last_less_flat_points = None
        self.last_position = np.eye(4)

    def append_pcd(self, pcd):
        sharp_points, less_sharp_points, flat_points, less_flat_points = self.extractor.extract_features(
            pcd[0], pcd[1], pcd[2])
        T = None
        if not self.inited:
            self.inited = True
            T = np.zeros(6)
        else:
            edge_corresp = self.find_edge_correspondences(sharp_points)
            surface_corresp = self.find_surface_correspondences(
                flat_points, pcd)
            optimizer = LOAMOptimizer(edge_corresp, surface_corresp)
            T = optimizer.optimize()
            import utils
            surf = np.vstack(
                (surface_corresp[1], surface_corresp[2], surface_corresp[3]))
            keypoints = utils.get_pcd_from_numpy(surf)
            keypoints.paint_uniform_color([0, 1, 0])
            pcd = utils.get_pcd_from_numpy(
                mrob.geometry.SE3(T).transform_array(pcd[0]))
            pcd.paint_uniform_color([0, 0, 1])
            orig = utils.get_pcd_from_numpy(surface_corresp[0])
            orig.paint_uniform_color([1, 0, 0])
            # o3d.visualization.draw_geometries([pcd, keypoints, orig])

        self.last_less_sharp_points = np.vstack(less_sharp_points)
        x = get_pcd_from_numpy(np.vstack(less_flat_points))
        y = np.vstack(less_flat_points)[:, 3].reshape((-1, 1)) / 64
        x.colors = o3d.utility.Vector3dVector(np.hstack((y, y, y)))
        x = x.voxel_down_sample(0.1)
        self.last_less_flat_points = np.hstack(
            (np.asarray(x.points), 64 * np.asarray(x.colors)[:, 0].reshape(
                (-1, 1))))
        scan_ids = self.last_less_flat_points[:, 3]
        sorted_ind = np.argsort(scan_ids, kind='stable')
        self.last_less_flat_points = self.last_less_flat_points[sorted_ind]
        self.last_position = mrob.geometry.SE3(T).T() @ self.last_position

        return mrob.geometry.SE3(
            T).T(), self.last_less_flat_points, self.last_less_flat_points

    def find_edge_correspondences(self, sharp_points):
        corners_cnt = len(sharp_points)

        edge_points = []
        edge_1 = []
        edge_2 = []
        less_sharp_points_tree = o3d.geometry.KDTreeFlann(
            get_pcd_from_numpy(self.last_less_sharp_points))
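        # For each sharp (edge) point in the current frame, find its nearest
        # neighbour in the previous sweep's less-sharp set, then search the
        # neighbouring scan lines above and below that neighbour for a second
        # close point; the two points define the edge line used as the
        # correspondence.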
        for i in range(corners_cnt):
            point_sel = sharp_points[i]
            _, idx, dist = less_sharp_points_tree.search_knn_vector_3d(
                point_sel[:3], 1)
            min_point_ind_2 = -1
            if dist[0] < self.DISTANCE_SQ_THRESHOLD:
                closest_point_ind = idx[0]
                min_point_sq_dist_2 = self.DISTANCE_SQ_THRESHOLD
                closest_point_scan_id = self.last_less_sharp_points[
                    closest_point_ind][3]

                dist_to_sel_point = matrix_dot_product(
                    (self.last_less_sharp_points[:, :3] - point_sel[:3]),
                    (self.last_less_sharp_points[:, :3] - point_sel[:3]))

                for j in range(closest_point_ind + 1,
                               len(self.last_less_sharp_points)):
                    if self.last_less_sharp_points[j][
                            3] <= closest_point_scan_id:
                        continue
                    if self.last_less_sharp_points[j][
                            3] > closest_point_scan_id + self.SCAN_VICINITY:
                        break

                    point_sq_dist = dist_to_sel_point[j]
                    if point_sq_dist < min_point_sq_dist_2:
                        min_point_sq_dist_2 = point_sq_dist
                        min_point_ind_2 = j

                for j in range(closest_point_ind - 1, -1, -1):
                    if self.last_less_sharp_points[j][
                            3] >= closest_point_scan_id:
                        continue
                    if self.last_less_sharp_points[j][
                            3] < closest_point_scan_id - self.SCAN_VICINITY:
                        break

                    point_sq_dist = dist_to_sel_point[j]
                    if point_sq_dist < min_point_sq_dist_2:
                        min_point_sq_dist_2 = point_sq_dist
                        min_point_ind_2 = j

                if min_point_ind_2 >= 0:
                    edge_points.append(point_sel)
                    edge_1.append(
                        self.last_less_sharp_points[closest_point_ind])
                    edge_2.append(self.last_less_sharp_points[min_point_ind_2])

        edge_points = np.vstack(edge_points)[:, :3]
        edge_1 = np.vstack(edge_1)[:, :3]
        edge_2 = np.vstack(edge_2)[:, :3]

        return edge_points, edge_1, edge_2

    def find_surface_correspondences(self, flat_points, pcd):
        surface_cnt = len(flat_points)
        print('Surface count: ', surface_cnt)

        surface_points = []
        surface_1 = []
        surface_2 = []
        surface_3 = []

        less_flat_points_tree = o3d.geometry.KDTreeFlann(
            get_pcd_from_numpy(self.last_less_flat_points))
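        # For each flat (planar) point, find its nearest neighbour in the
        # previous sweep's less-flat set, then pick a second point on the same
        # scan line and a third point on a neighbouring scan line; the three
        # points define the planar patch used as the correspondence.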
        for i in range(surface_cnt):
            point_sel = flat_points[i]
            _, idx, dist = less_flat_points_tree.search_knn_vector_3d(
                point_sel[:3], 1)
            min_point_ind_2 = -1
            min_point_ind_3 = -1

            dist_to_sel_point = matrix_dot_product(
                (self.last_less_flat_points[:, :3] - point_sel[:3]),
                (self.last_less_flat_points[:, :3] - point_sel[:3]))

            closest_point_ind = idx[0]
            v = self.last_less_flat_points[
                closest_point_ind][:3] - point_sel[:3]
            dist = np.dot(v, v)
            if dist < self.DISTANCE_SQ_THRESHOLD:
                closest_point_scan_id = self.last_less_flat_points[
                    closest_point_ind][3]
                min_point_sq_dist_2 = self.DISTANCE_SQ_THRESHOLD
                min_point_sq_dist_3 = self.DISTANCE_SQ_THRESHOLD
                for j in range(closest_point_ind + 1,
                               len(self.last_less_flat_points)):
                    if self.last_less_flat_points[j][
                            3] > closest_point_scan_id + self.SCAN_VICINITY:
                        break

                    point_sq_dist = dist_to_sel_point[j]

                    if self.last_less_flat_points[j][3] <= closest_point_scan_id \
                            and point_sq_dist < min_point_sq_dist_2:
                        min_point_sq_dist_2 = point_sq_dist
                        min_point_ind_2 = j
                    elif self.last_less_flat_points[j][3] > closest_point_scan_id \
                            and point_sq_dist < min_point_sq_dist_3:
                        min_point_sq_dist_3 = point_sq_dist
                        min_point_ind_3 = j

                for j in range(closest_point_ind - 1, -1, -1):
                    if self.last_less_flat_points[j][
                            3] < closest_point_scan_id - self.SCAN_VICINITY:
                        break

                    point_sq_dist = dist_to_sel_point[j]

                    if self.last_less_flat_points[j][3] >= closest_point_scan_id \
                            and point_sq_dist < min_point_sq_dist_2:
                        min_point_sq_dist_2 = point_sq_dist
                        min_point_ind_2 = j
                    elif self.last_less_flat_points[j][3] < closest_point_scan_id \
                            and point_sq_dist < min_point_sq_dist_3:
                        min_point_sq_dist_3 = point_sq_dist
                        min_point_ind_3 = j

                if min_point_ind_2 >= 0 and min_point_ind_3 >= 0:
                    surface_points.append(point_sel)
                    surface_1.append(
                        self.last_less_flat_points[closest_point_ind])
                    surface_2.append(
                        self.last_less_flat_points[min_point_ind_2])
                    surface_3.append(
                        self.last_less_flat_points[min_point_ind_3])

        surface_points = np.vstack(surface_points)
        surface_1 = np.vstack(surface_1)
        surface_2 = np.vstack(surface_2)
        surface_3 = np.vstack(surface_3)

        print('output: ', surface_points.shape[0])
        import utils
        # import open3d as o3d
        ind = surface_1[:, 3] > 0
        surf = np.vstack((surface_1[ind], surface_2[ind], surface_3[ind]))
        keypoints = utils.get_pcd_from_numpy(surf)
        keypoints.paint_uniform_color([0, 1, 0])
        pcd = utils.get_pcd_from_numpy(pcd[0])
        pcd.paint_uniform_color([0, 0, 1])
        orig = utils.get_pcd_from_numpy(surface_points[ind])
        orig.paint_uniform_color([1, 0, 0])
        o3d.visualization.draw_geometries([pcd, keypoints, orig])

        return surface_points[ind][:, :3], surface_1[ind][:, :3], surface_2[
            ind][:, :3], surface_3[ind][:, :3]
Example #10
print(
    'Loaded ' + str(len(test_images_filenames)) +
    ' testing images filenames with classes ', set(test_labels))

# Load precomputed labels if available
precomp_label_filename = classifier + '_' + feature_method + '.npy'
if os.path.isfile(precomp_label_filename) and not force_reload:
    print 'Loading previous predictions'
    predicted_classes = np.load(precomp_label_filename)
else:
    start = time.time()

    print 'Extracting features'
    fe = FeatureExtractor(feature_method)
    (X, y) = fe.extract_features(train_images_filenames,
                                 train_labels,
                                 nimmax=30)

    print 'Training a classifier'
    c = Classifier(classifier)
    c.fit(X, y)

    print 'Predicting test set labels with the classifier'
    numtestimages = 0
    predicted_classes = []
    for i in range(len(test_images_filenames)):
        imfilename = test_images_filenames[i]
        des = fe.extract_single_image_features(imfilename)
        predictedclass = c.predict(des)
        predicted_classes.append(predictedclass)
        print('image ' + imfilename + ' was from class ' + test_labels[i] +
Example #11
class MultiReader(DataLoader):
    def __init__(self, output_width=11, training_frac=70.0, validation_frac=15.0, debug=False):
        self.input_width = 400
        self.output_width = output_width
        self.training_frac = training_frac
        self.validation_frac = validation_frac
        self.debug = debug
        # self.dir = "/home/jlawson/Dropbox/ProteinFunctionData/"      # Where the files live.
        self.names = [  # Names of all of the files.
            "baseplate_3370",
            "collar_1385",
            "htj_2258_nofg",
            "major_tail_1512",
            "mcp_3589",
            "minor_capsid_1500_nofg",
            "minor_tail_2033",
            "portal_2141",
            "tail_fiber_3007",
            "tail_sheath_2350",
        ]

        self.feature_extractor = FeatureExtractor()

    def load_data(self, source):
        """Load the data from a directory with a collection of source files,
        one file for each kind of protein. 
        
        Returns an array of pairs in the form:
        
        [(train_set_in, train_set_out), (validation_set_in, validation_set_out), (test_set_in, test_set_out)]

        :type source:   String
        :param source:  The directory where the source files are located.
        """
        dir = source
        raw_data = list()
        unsupporteds = list()
        for i in range(0, len(self.names)):
            num_in_file = 0
            if self.debug:
                print (dir + self.names[i] + ".faa")
            handle = open(dir + self.names[i] + ".faa", "rU")  # Open a file.
            for record in SeqIO.parse(handle, "fasta"):
                num_in_file += 1
                try:
                    # print "      " + record.id
                    feature_vector = self.feature_extractor.extract_features(record)
                    # Now we have to augment the feature vector with the output
                    # vector. So we:
                    #   1) Make a new array a bit longer than the feature vector,
                    #   2) Copy the feature vector into the first cells of the new array,
                    #   3) Find the appropriate cell in the tail of the new array
                    #      and set that one equal to 1.
                    prepared_data_record = numpy.zeros(len(feature_vector) + self.output_width)
                    for col in range(0, len(feature_vector)):  # This surely could be done more efficiently.
                        prepared_data_record[col] = feature_vector[col]  # Doesn't matter for now.
                    prepared_data_record[
                        len(feature_vector) + i
                    ] = 1  # The class of the protein is taken from the order of the files in the list "names"
                    raw_data.append(prepared_data_record)
                except KeyError:
                    if self.debug:
                        print "   Unsupported sequence: " + record.id + "   " + str(record.annotations)
                    unsupporteds.append(record)
                pass
            handle.close()
            if self.debug:
                print "Total in file " + self.names[i] + " = " + str(num_in_file)

        # Now we are done reading all of the data in. In debug mode, print some
        # overall summary information.
        if self.debug:
            print "Supported Sequences = " + str(len(raw_data))
            print "Unsupported Sequences = " + str(len(unsupporteds))

        num_examples = len(raw_data)

        # But the labeled data we have is not randomly ordered. It is sorted
        # by class. We need to shuffle it up or we will only train on the first
        # classes.
        if self.debug:
            print "Shuffling data to randomize for training"
        shuffle = self.rand_perm(num_examples)

        data = numpy.ndarray((num_examples, self.input_width + self.output_width), float)
        for n in range(0, num_examples):
            for w in range(0, self.input_width + self.output_width):
                s = raw_data[shuffle[n]][w]
                data[n, w] = float(s)
        if self.debug:
            print "Finished shuffling data"
            print "Processing data to cull outliers"
        data = self.preprocess(self.cull(data))
        num_examples = len(data)
        print "Data shape = ", data.shape, "   num_examples=", num_examples
        inputs = numpy.array(data)[:, 0 : self.input_width]
        outputs_full = numpy.array(data)[:, self.input_width : self.input_width + self.output_width]
        if self.debug:
            print "Finished culling outliers"
            print inputs.shape
            print outputs_full.shape
        outputs = numpy.ndarray((num_examples,), int)
        for n in range(0, num_examples):
            found_class = False
            for w in range(0, self.output_width):
                if outputs_full[n, w] > 0.5:
                    outputs[n] = w
                    found_class = True
                    break
        num_training_cases = self.num_training(num_examples)
        num_validation_cases = self.num_validation(num_examples)
        num_test_cases = self.num_test(num_examples)

        print num_training_cases, " ", num_validation_cases, " ", num_test_cases
        training_set = (inputs[0:num_training_cases, :], outputs[0:num_training_cases])
        validation_set = (
            inputs[num_training_cases : num_training_cases + num_validation_cases, :],
            outputs[num_training_cases : num_training_cases + num_validation_cases],
        )
        test_set = (
            inputs[num_training_cases + num_validation_cases :, :],
            outputs[num_training_cases + num_validation_cases :],
        )
        training_set_x, training_set_y = theanoutil.shared_dataset(training_set)
        validation_set_x, validation_set_y = theanoutil.shared_dataset(validation_set)
        test_set_x, test_set_y = theanoutil.shared_dataset(test_set)

        if self.debug:
            print "TYPE of test_set_x =", type(test_set_x)
            print "TYPE of test_set=", type(test_set), "  SIZE of test_set=", len(test_set)
            print "TYPE of test_set[0]=", type(test_set[0]), "  SHAPE of test_set[0]=", test_set[0].shape
            print "TYPE of test_set[1]=", type(test_set[1]), "  SHAPE of test_set[1]=", test_set[1].shape
            print "VALUE of training_set[0,0,0]=", training_set[0][0, 0]
            print "VALUE of training_set[1,0]=", training_set[1][0], "   test_set[1,0]=", test_set[1][0]

        rval = [(training_set_x, training_set_y), (validation_set_x, validation_set_y), (test_set_x, test_set_y)]
        return rval

    # Everything from here down should be turned into a base class.

    def num_training(self, num_examples):
        return num_examples * (self.training_frac / 100.0)

    def num_validation(self, num_examples):
        return num_examples * (self.validation_frac / 100.0)

    def num_test(self, num_examples):
        return num_examples - (self.num_training(num_examples) + self.num_validation(num_examples))

    def rand_perm(self, length):
        # In debug mode, we want to have a repeatable random number seed so
        # that we can have a repeatable shuffling.
        if self.debug:
            seed(1)
        shuffle = numpy.ndarray((length,), int)
        for n in range(0, length):
            shuffle[n] = n
        for n in range(0, length):
            swap_cell = randint(0, length - 1)
            temp = shuffle[swap_cell]
            shuffle[swap_cell] = shuffle[n]
            shuffle[n] = temp
        return shuffle

    def cull(self, data):
        # Make a list of all row numbers that need to get culled from the data.
        cull_list = []
        for n in range(0, len(data)):
            if self.prune(data[n]):
                cull_list.append(n)
        cull_list.append(len(data))  # A sentinel at the end of the cull list.

        # Make a new array that doesn't have the culled items in it.
        # The 1+ is for the sentinel.
        new_data = numpy.ndarray((1 + len(data) - len(cull_list), self.input_width + self.output_width), float)
        next_cull_index = 0
        next_data_index = 0
        for n in range(0, len(data)):
            if n == cull_list[next_cull_index]:
                next_cull_index += 1
            else:
                new_data[next_data_index] = data[n]
                next_data_index += 1
        print "Number culled = ", len(cull_list) - 1
        return new_data

    def prune(self, example):
        sum = 0.0
        for n in range(0, self.input_width):
            if example[n] < 0.0:
                return True
            if example[n] > 1.0:
                return True
            sum += example[n]
        if sum > 1.01:
            return True
        if sum < 0.99:
            return True
        return False

    def preprocess(self, data):
        n = self.input_width
        for r in range(0, len(data)):
            sum_x = 0.0
            sum_x2 = 0.0
            for c in range(0, n):
                sum_x += data[r, c]
                sum_x2 += data[r, c] * data[r, c]
            mu = sum_x / n
            std = math.sqrt((sum_x2 - (sum_x * sum_x) / n) / n)  # Population std
            for c in range(0, n):
                z = (data[r, c] - mu) / std
                # squashed_z = sigma(z)
                data[r, c] = z
            if r % 1000 == 0:
                print "Preprocessed row ", r
        return data
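A brief usage sketch (this example is written for Python 2 / Theano; the directory path is a placeholder and must end with a path separator, since load_data concatenates it with each file name):

loader = MultiReader(output_width=10, debug=True)  # ten protein classes are listed in self.names
train_set, valid_set, test_set = loader.load_data("/path/to/fasta/dir/")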
Example #12
def main(args):
    graph_dataset = MAGDataset(name="", path=args.data, raw=False)

    node_features = graph_dataset.g_full.ndata['x']
    node_features = F.normalize(node_features, p=2, dim=1)
    vocab = graph_dataset.vocab
    full_graph = graph_dataset.g_full.to_networkx()
    kv = KeyedVectors(vector_size=node_features.shape[1])
    kv.add([str(i) for i in range(len(vocab))], node_features.numpy())

    if args.mode == "train":
        node_list = graph_dataset.train_node_ids
        graph = full_graph.subgraph(graph_dataset.train_node_ids).copy()
    elif args.mode == "validation":
        node_list = graph_dataset.validation_node_ids
        graph = full_graph.subgraph(graph_dataset.train_node_ids +
                                    graph_dataset.validation_node_ids).copy()
    else:
        node_list = graph_dataset.test_node_ids
        graph = full_graph.subgraph(graph_dataset.train_node_ids +
                                    graph_dataset.test_node_ids).copy()

    roots = [node for node in graph.nodes() if graph.in_degree(node) == 0]
    interested_node_set = set(node_list) - set(roots)
    node_list = list(interested_node_set)
    nq = NegativeQueue(node_list.copy() * 5)

    node2parents = {}  # list of correct parent positions
    node2masks = {}  # positions that should not be chosen as negative positions
    for node in tqdm(graph.nodes(), desc="generating intermediate data ..."):
        parents = [edge[0] for edge in graph.in_edges(node)]
        node2parents[node] = parents
        if node in interested_node_set:
            descendants = nx.descendants(graph, node)
            masks = set(list(descendants) + parents + [node] + roots)
            node2masks[node] = masks

    edge_to_remove = []
    if args.mode == "validation":
        for node in graph_dataset.validation_node_ids:
            edge_to_remove.extend(list(graph.in_edges(node)))
        print(
            f"Remove {len(edge_to_remove)} edges between validation nodes and training nodes"
        )
    graph.remove_edges_from(edge_to_remove)
    print("=== Finish data loading ===\n")

    feature_extractor = FeatureExtractor(graph, kv)
    NEGATIVE_RATIO = args.neg
    featMat = []
    labels = []
    for query_node in tqdm(node_list):
        cnt = 0
        for positive_parent in node2parents[query_node]:
            featMat.append(
                feature_extractor.extract_features(query_node,
                                                   positive_parent))
            labels.append(1)
            cnt += 1

        num_negatives = NEGATIVE_RATIO * cnt
        avoid_set = node2masks[query_node]
        negatives = nq.sample_avoid_positive_set(avoid_set, num_negatives)
        for negative_parent in negatives:
            featMat.append(
                feature_extractor.extract_features(query_node,
                                                   negative_parent))
            labels.append(0)

    data = xgb.DMatrix(np.array(featMat),
                       label=np.array(labels),
                       missing=-999.0)
    data.save_binary(args.output)
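The saved binary matrix could later be reloaded for training; the path and parameters below are illustrative only, not taken from this project:

dtrain = xgb.DMatrix('features.dmatrix')  # placeholder for the file saved as args.output
params = {'objective': 'binary:logistic', 'eval_metric': 'auc'}  # assumed settings
booster = xgb.train(params, dtrain, num_boost_round=100)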
                            name + ".entity")
    link_path = os.path.join(DATA_ROOT, "train", "link", name + ".link")
    atts_path = os.path.join(DATA_ROOT, "train", "atts", name + ".atts")
    with open(ing_path) as ing, \
         open(ins_path) as ins, \
         open(link_path) as link, \
         open(atts_path) as atts:
        recipe = PreprocessedRecipe(ing, ins, link, atts)
        samples_in_recipe = generate_samples(recipe)
        for label, pair in samples_in_recipe:
            sample_labels.append(label)
            sample_pairs.append(pair)
            sample_recipes.append(recipe)

# Extract Features
features = extractor.extract_features(sample_pairs, sample_recipes)

# Training
cls = LinearSVMClassifier()
cls.train(sample_labels, features)

# Dev Testing
dev_labels = []
dev_pairs = []
dev_recipes = []
for name in dev_names:
    ing_path = os.path.join(DATA_ROOT, "dev", "ing_entity", name + ".ient")
    ins_path = os.path.join(DATA_ROOT, "dev", "instruct_entity",
                            name + ".entity")
    link_path = os.path.join(DATA_ROOT, "dev", "link", name + ".link")
    atts_path = os.path.join(DATA_ROOT, "dev", "atts", name + ".atts")
Example #14
    def set_data(self, datafile, ratingsfile):
        fe = FeatureExtractor(datafile)
        self.features = fe.extract_features()
        self.info = fe.extract_info()
        self.make_lookup_table()
        self.targets = create_targets(self.info, ratingsfile)
Example #15
                            help='path to the positive corpus file')
    arg_parser.add_argument('corpus_file_neg',
                            help='path to the negative corpus file')
    arg_parser.add_argument('output_file', help='path to the output file')
    args = arg_parser.parse_args()

    print('\n - Autosarkasmus Baseline Feature Extraction (Simplified) -\n')

    # feature setup
    print('setting up features...')
    features, feature_order = setup_features()
    print('setting up feature extractor...')
    feature_extractor = FeatureExtractor(features, feature_order)

    # ARFF document setup
    arff_doc = ARFFDocument('Sarkasmuserkennung', features, feature_order)

    # the magic
    tweets_ext = feature_extractor.extract_features(args.corpus_file_pos,
                                                    args.corpus_file_neg,
                                                    verbose=True)

    # generate final ARFF document
    print('generating ARFF document...')
    for tweet_ext in tweets_ext:
        arff_doc.add_data(tweet_ext)
    arff_doc.generate_document(args.output_file)

    print('\n - extracted features from ' + str(len(arff_doc.data)) +
          ' tweets -\n')
Example #16
    def start_processing(cls, path_to_base_folder):

        start = time()

        Utilities.prepare_properties_dictionary()

        if not os.path.exists(
                os.path.join(
                    path_to_base_folder,
                    Utilities.get_prop_value(Utilities.BOOK_DESCRIPTOR_KEY))):
            print("Please provide the book descriptor file")
            return

        if not os.path.exists(
                os.path.join(path_to_base_folder,
                             Utilities.get_prop_value(
                                 Utilities.BOOK_REPO_KEY))):
            print("Please provide the book folder")
            return

        if not os.path.exists(
                os.path.join(
                    os.getcwd(),
                    Utilities.get_prop_value(Utilities.DATA_POINT_KEY))):
            data_start = time()
            DataPointSelector.select_datapoints(path_to_base_folder)
            data_end = time()

            print("Data Selection took : {} minutes".format(
                (data_end - data_start) / 60))
        else:
            print(
                "Data Point CSV found in directory, continuing to Feature Extraction"
            )

        if not os.path.exists(
                os.path.join(
                    os.getcwd(),
                    Utilities.get_prop_value(Utilities.PYTHON_FEATURE_CSV))):

            py_start = time()
            extractor = FeatureExtractor(
                base_folder_address=path_to_base_folder)
            extractor.extract_features()
            py_end = time()

            print("Python Extractor took : {} minutes".format(
                (py_end - py_start) / 60))
        else:
            print(
                "Python Feature Vector CSV found in directory, continuing to run Java project"
            )

        if not os.path.exists(
                os.path.join(
                    os.getcwd(),
                    Utilities.get_prop_value(Utilities.JAVA_FEATURE_CSV))):

            bat_file_name = r'command.bat'
            folder_path = os.path.join(
                path_to_base_folder,
                Utilities.get_prop_value(Utilities.BOOK_REPO_KEY))
            output_file_name = ".\\" + Utilities.get_prop_value(
                Utilities.JAVA_FEATURE_CSV)
            book_descriptor_file_name = ".\\" + Utilities.get_prop_value(
                Utilities.BOOK_DESCRIPTOR_KEY)
            data_points_file_name = ".\\" + Utilities.get_prop_value(
                Utilities.DATA_POINT_KEY)

            java_start = time()
            x = subprocess.call([
                bat_file_name, folder_path, output_file_name,
                book_descriptor_file_name, data_points_file_name
            ])
            java_end = time()

            print("Java Project took : {} minutes".format(
                (java_end - java_start) / 60))

        else:
            print(
                "Java output Feature Vector CSV found in directory, continuing to Model Runner"
            )

        runner = ModelRunner(path_to_base_folder)
        runner.drive_model_runner()

        end = time()
        total = end - start
        print("Total time for the whole process : {} minutes".format(
            total / 60))
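A minimal invocation sketch; the class that owns this classmethod is not shown in the snippet, so the name BookProcessor below is purely a placeholder:

# 'BookProcessor' is a hypothetical name for the class that defines start_processing.
BookProcessor.start_processing('/path/to/base/folder')  # placeholder path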