def normalizeDatabase(self):
        """
        If a collection has a field which contains several fields,
        we extract those fields and make a new collection out of it
        """
        n = Normalizer(self.metadata_db, self.dataset_db)

        # Bombs away!
        LOG.info("Processing mongodb dataset normalization")
        n.process()
        LOG.info("Finshing normalization")
Example #3
    def __init__(self,
                 n_states,
                 n_actions,
                 n_goals,
                 action_bounds,
                 capacity,
                 env,
                 k_future,
                 batch_size,
                 action_size=1,
                 tau=0.05,
                 actor_lr=1e-3,
                 critic_lr=1e-3,
                 gamma=0.98):
        self.device = device("cpu")
        self.n_states = n_states
        self.n_actions = n_actions
        self.n_goals = n_goals
        self.k_future = k_future
        self.action_bounds = action_bounds
        self.action_size = action_size
        self.env = env

        self.actor = Actor(self.n_states,
                           n_actions=self.n_actions,
                           n_goals=self.n_goals).to(self.device)
        self.critic = Critic(self.n_states,
                             action_size=self.action_size,
                             n_goals=self.n_goals).to(self.device)
        self.sync_networks(self.actor)
        self.sync_networks(self.critic)
        self.actor_target = Actor(self.n_states,
                                  n_actions=self.n_actions,
                                  n_goals=self.n_goals).to(self.device)
        self.critic_target = Critic(self.n_states,
                                    action_size=self.action_size,
                                    n_goals=self.n_goals).to(self.device)
        self.init_target_networks()
        self.tau = tau
        self.gamma = gamma

        self.capacity = capacity
        self.memory = Memory(self.capacity, self.k_future, self.env)

        self.batch_size = batch_size
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.actor_optim = Adam(self.actor.parameters(), self.actor_lr)
        self.critic_optim = Adam(self.critic.parameters(), self.critic_lr)

        self.state_normalizer = Normalizer(self.n_states[0],
                                           default_clip_range=5)
        self.goal_normalizer = Normalizer(self.n_goals, default_clip_range=5)
Example #4
    def stoi(self, filepath, clean_filepath=None):
        # filepath = path to mashup
        # Needs octave and octave-signal installed
        # Use "pip install oct2py" to install python - octave bridge
        # STOI assumes
        # * a sampling rate of 10kHz, resamples otherwise
        # * window length of 384ms
        # * 15 third octave bands over full frequency range
        # * overlapping segments with hanning window
        # * removes silent frames
        import librosa
        from oct2py import octave
        if clean_filepath is None:
            # No clean file given.
            # Get processed and clean file from mashup.
            vocal_isolation = VocalIsolation(config)
            vocal_isolation.loadWeights(config.weights)
            audio, sampleRate = conversion.load_audio_file(filepath)
            spectrogram = conversion.audio_file_to_spectrogram(
                audio, fftWindowSize=config.fft,
                learn_phase=self.config.learn_phase)

            normalizer = Normalizer()
            normalize = normalizer.get(both=False)
            denormalize = normalizer.get_reverse()

            # normalize
            spectrogram, norm = normalize(spectrogram)

            info = vocal_isolation.process_spectrogram(spectrogram,
                                                       config.get_channels())
            spectrogram, new_spectrogram = info
            # de-normalize
            new_spectrogram = denormalize(new_spectrogram, norm)

            processed = conversion.spectrogram_to_audio_file(new_spectrogram,
                                                             config.fft,
                                                             config.phase_iterations)

            clean_filepath = filepath.replace("_all.wav", "_vocal.wav")
            clean, sampling_rate = librosa.load(clean_filepath)
        else:
            # A clean file is given.
            # Compare it with the processed audio.
            processed, sampling_rate = librosa.load(filepath)
            clean, sampling_rate = librosa.load(clean_filepath)

        # Make sure the original and processed audio have the same length
        clean = clean[:processed.shape[0]]

        octave.eval("pkg load signal")
        d = octave.stoi(clean, processed, sampling_rate)
        self._write("stoi: %f" % d)
Example #5
    def __init__(self, **params):
        self.num_epochs = params["num_epochs"]
        self.early_stop_tolerance = params["early_stop_tolerance"]
        self.norm_method = params["norm_method"]
        self.loss_type = params["loss_type"]
        self.learning_rate = params["learning_rate"]
        self.l2_reg = params["l2_reg"]
        self.clip = params['clip']
        self.device = params['device']

        self.input_normalizer = Normalizer(self.norm_method)
        self.output_normalizer = Normalizer(self.norm_method)
Example #6
 def test_extract_section_features(self):
     normalizer = Normalizer()
     self.assertEqual(normalizer.extract_section_features('136'), generate_feature(d='136'))
     self.assertEqual(normalizer.extract_section_features('Reserve 40	'), generate_feature(pp="reserve", d='40'))
     self.assertEqual(normalizer.extract_section_features('Top Deck 6	'), generate_feature(pp="top deck", d='6'))
     self.assertEqual(normalizer.extract_section_features('31RS	'), generate_feature(d='31', s='rs'))
     self.assertEqual(normalizer.extract_section_features('Left Field Pavilion 311'),
                      generate_feature(pp='left field pavilion', d='311'))
     self.assertEqual(normalizer.extract_section_features('Infield Reserve IFR7	'),
                      generate_feature(pp='infield reserve', p='ifr', d='7'))
     self.assertEqual(normalizer.extract_section_features('311PL'), generate_feature(d='311', s='pl'))
     self.assertEqual(normalizer.extract_section_features('F9'), generate_feature(p='f', d='9'))
     self.assertEqual(normalizer.extract_section_features('L36'), generate_feature(p='l', d='36'))
Example #7
    def _prepare_normalizers(self):
        normalizer_tf = dict()
        with tf.variable_scope('o_stats'):
            normalizer_tf['o'] = Normalizer(self._env_spec['o_dim'],
                                            **self._normalizer_params)
        with tf.variable_scope('u_stats'):
            normalizer_tf['a'] = Normalizer(self._env_spec['a_dim'],
                                            **self._normalizer_params)
        with tf.variable_scope('abs_pred_err_stats'):
            normalizer_tf['abs_pred_err'] = Normalizer(
                self._env_spec['o_dim'], **self._normalizer_params)

        return normalizer_tf
Example #8
	def clean(self, tweets):
		for tw in tweets:
			count = 0
			for t in tweets[tw]:
				norm = Normalizer()
				stp = StpRemoval()
				t['text_clean'] = t['text'].encode('utf-8', errors='ignore')
				t['text_clean'] = t['text_clean'].translate(string.maketrans(string.punctuation, ' ' * len(string.punctuation)))
				text = norm.normalize(t['text_clean'])
				text = stp.removeStp(text)
				tweets[tw][count]['text_clean'] = text.lower()
				count = count + 1
		return tweets
 def reload(self):
     """Refreshes this instance's normalizers pool."""
     self.normalizers = {'raw': [], 'body': []}
     for path in self.iter_normalizer():
         norm = parse(open(path))
         if not self.dtd.validate(norm):
             warnings.warn('Skipping %s : invalid DTD' % path)
             print('invalid normalizer', path)
         else:
             normalizer = Normalizer(norm, self.ctt)
             normalizer.uuid = self._compute_norm_uuid(normalizer)
             self.normalizers.setdefault(normalizer.appliedTo, [])
             self.normalizers[normalizer.appliedTo].append(normalizer)
     self.activate_normalizers()
 def reload(self):
     """Refreshes this instance's normalizers pool."""
     self.normalizers = {'raw': [], 'body': []}
     for path in self.iter_normalizer():
         norm = parse(open(path))
         if not self.dtd.validate(norm):
             warnings.warn('Skipping %s : invalid DTD' % path)
             print('invalid normalizer', path)
         else:
             normalizer = Normalizer(norm, self.ctt, self.ccb)
             normalizer.uuid = self._compute_norm_uuid(normalizer)
             self.normalizers.setdefault(normalizer.appliedTo, [])
             self.normalizers[normalizer.appliedTo].append(normalizer)
     self.activate_normalizers()
Example #11
    def __init__(self,
                 env,
                 act_dim,
                 state_dim,
                 goal_dim,
                 act_range,
                 buffer_size=int(1e6),
                 gamma=0.98,
                 lr=0.001,
                 tau=0.95):
        """ Initialization
        """
        # Environment and A2C parameters
        self.act_dim = act_dim
        self.act_range = act_range
        self.env_dim = state_dim + goal_dim
        self.gamma = gamma
        self.lr = lr
        self.tau = tau
        self.env = env

        # Create actor and critic networks
        self.actor_network = Actor(self.env_dim, act_dim, act_range)
        self.actor_target_network = Actor(self.env_dim, act_dim, act_range)
        self.actor_target_network.load_state_dict(
            self.actor_network.state_dict())

        self.critic_network = Critic(self.env_dim, act_dim, act_range)
        self.critic_target_network = Critic(self.env_dim, act_dim, act_range)
        self.critic_target_network.load_state_dict(
            self.critic_network.state_dict())

        sync_networks(self.actor_network)
        sync_networks(self.critic_network)

        # Optimizer
        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(),
                                            lr=lr)
        self.critic_optim = torch.optim.Adam(self.critic_network.parameters(),
                                             lr=lr)

        # Replay buffer
        # self.buffer = MemoryBuffer(buffer_size)
        self.buffer = ReplayMemory(buffer_size)

        # Normalizers
        self.goal_normalizer = Normalizer(
            goal_dim, default_clip_range=5)  # Clip between [-5, 5]
        self.state_normalizer = Normalizer(state_dim, default_clip_range=5)
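The Normalizer(..., default_clip_range=5) instances above keep running statistics of states and goals and clip the normalized values to [-5, 5]. A minimal sketch of that idea, assuming a running mean/std interface (this is not the project's actual implementation):

import numpy as np

class RunningNormalizer:
    """Tracks a running mean/std of observed vectors and clips the normalized output."""

    def __init__(self, size, default_clip_range=5):
        self.sum = np.zeros(size)
        self.sumsq = np.zeros(size)
        self.count = 1e-4  # avoids division by zero before the first observation
        self.clip_range = default_clip_range

    def observe(self, x):
        x = np.atleast_2d(x)
        self.sum += x.sum(axis=0)
        self.sumsq += np.square(x).sum(axis=0)
        self.count += x.shape[0]

    def normalize(self, x):
        mean = self.sum / self.count
        std = np.sqrt(np.maximum(self.sumsq / self.count - np.square(mean), 1e-8))
        return np.clip((x - mean) / std, -self.clip_range, self.clip_range)

norm = RunningNormalizer(3)
norm.observe([0.1, -0.2, 0.3])
print(norm.normalize([0.1, -0.2, 0.3]))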
Example #12
    def train(self, batch_size, fold_idx, normalize=True, augment=False):
        """ Dataset for model training
        Args:
            batch_size: int, number of images in a batch
            fold_idx: int, index of fold, from 0 to n_folds - 1
            normalize: bool, whether to normalize training data
                                    with Welford's online algorithm.
            augment: bool, whether to use augmentation or not
        Returns:
            data: TensorFlow dataset
            steps: int, number of steps in train epoch
        """
        if not (fold_idx >= 0 and fold_idx < self.n_folds):
            raise Exception(('Fold index {} is out of expected range:' +
                             '  [0, {}]').format(fold_idx, self.n_folds - 1))

        if normalize and augment:
            raise Exception('Normalization with the Welford algorithm and '
                            'augmentation cannot be used together')

        print(' ... Generating Training Dataset ... ')
        if self.n_folds == 1:
            train_idx = range(0, len(self.filenames))
        else:
            train_idx, _ = list(self.kf.split(self.filenames))[fold_idx]
        filenames = np.array(self.filenames)[train_idx]
        labels = np.array(self.labels)[train_idx]
        steps = math.ceil(len(filenames) / batch_size)
        if normalize:
            mean, std = Normalizer.calc_mean_and_std(filenames, self.img_size)
            mean = np.array([mean['red'], mean['green'], mean['blue']])
            std = np.array([std['red'], std['green'], std['blue']])
        else:
            # values taken from ImageNet Dataset
            mean = np.array([0.485, 0.456, 0.406])
            std = np.array([0.229, 0.224, 0.225])
        self.normalizer = Normalizer(mean, std)
        data = tf.data.Dataset.from_tensor_slices(
            (tf.constant(filenames), tf.constant(labels)))
        data = data.map(self.parse_fn)
        if augment:
            augs = [self.flip, self.color, self.rotate, self.zoom]
            for f in augs:
                data = data.map(f, num_parallel_calls=4)
            data = data.map(self.drop, num_parallel_calls=4)
        data = data.shuffle(buffer_size=len(filenames))
        data = data.batch(batch_size)
        data = data.prefetch(1)
        return data, steps
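The docstring above refers to Welford's online algorithm for computing the dataset mean and standard deviation in a single pass. A small self-contained sketch of that update rule (independent of the Normalizer.calc_mean_and_std helper used above):

import numpy as np

class WelfordStats:
    """Welford's online algorithm: numerically stable single-pass mean/variance."""

    def __init__(self):
        self.n = 0
        self.mean = 0.0
        self.m2 = 0.0  # running sum of squared deviations from the current mean

    def update(self, x):
        self.n += 1
        delta = x - self.mean
        self.mean += delta / self.n
        self.m2 += delta * (x - self.mean)

    @property
    def std(self):
        return np.sqrt(self.m2 / self.n) if self.n > 0 else 0.0

stats = WelfordStats()
for value in [0.48, 0.51, 0.45, 0.50]:  # e.g. per-image channel means
    stats.update(value)
print(stats.mean, stats.std)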
    def __init__(self,
                 hp=None,
                 input_size=None,
                 output_size=None,
                 normalizer=None,
                 policy=None,
                 monitor_dir=None):

        self.hp = hp or Hp()
        np.random.seed(self.hp.seed)
        self.env = gym.make(self.hp.env_name)
        if monitor_dir is not None:
            # Record periodic videos
            should_record = lambda i: self.record_video
            self.env = wrappers.Monitor(self.env,
                                        monitor_dir,
                                        video_callable=should_record,
                                        force=True)
        self.hp.episode_length = self.env.spec.timestep_limit or self.hp.episode_length
        self.input_size = input_size or self.env.observation_space.shape[0]
        self.output_size = output_size or self.env.action_space.shape[0]
        self.normalizer = normalizer or Normalizer(self.input_size)
        self.policy = policy or Policy(self.input_size, self.output_size,
                                       self.hp)
        self.record_video = False
Example #14
class Preprocessor():
    def __init__(self, data_dir, start_index=0):
        self.normalizer = Normalizer()
        self.data_dir = data_dir
        self.docs_number = len(glob(os.path.join(self.data_dir, '**', '*.nxml')))
        self.inserter = Inserter()
        self.start_index = start_index


    def preprocess(self):
        print('Start time: {0}'.format(datetime.now()))
        for index, file_name in enumerate(sorted(glob(os.path.join(self.data_dir, '**', '*.nxml')))):
            if index >= self.start_index:
                terms = self.normalizer.normalize(file_name)

                for term in set(terms):
                    self.inserter.insert(term, self.doc_id(file_name), terms.count(term))

                if index % 100 == 0:
                    print('processing doc {0}/{1}'.format(index + 1, self.docs_number))

        print('Finished at: {0}'.format(datetime.now()))


    def doc_id(self, file_name):
        return os.path.splitext(os.path.basename(file_name))[0]
Example #15
    def __init__(self):
        self.__vocab: t.List[Replacement] = self.__make_vocab()
        self.__normalizer: Normalizer = Normalizer(self.__vocab)

        self.__string: str = ''
        self.__word: Word = Word()
        self.__normalized_word: Word = Word()
    def __init__(self, *args, **kwargs):
        """
        Constructor method, initializes variables.
        """

        # Initializing variables
        self.nlpnet_normalizer = Normalizer()
Example #17
class DataSource(object):
    def __init__(self, mode, data_path):
        self.normalizer = Normalizer()
        self.data_path = data_path
        self.mode = mode
        self.filenames = []
        self.contents  = []
        self.labels    = []

    def load_data(self):
        #   load raw sample
        raw_data = []
        raw_sample = ""
        with open(self.data_path, "r") as f:
            for row in f:
                if (self.mode + "_") in row:
                    raw_data.append(raw_sample)
                    raw_sample = row
                else:
                    raw_sample += row
        raw_data.append(raw_sample)
        raw_data = raw_data[1:]
        
        #   process raw data
        for i, raw_sample in enumerate(raw_data):
            self.filenames.append(raw_sample.split("\n")[0])
            print(i, end = "\r")
            content = re.search("\"(.*)\"", raw_sample, re.DOTALL).group(1)
            self.contents.append(self.normalizer.transform(content))
            if self.mode == "train":
                self.labels.append(raw_sample.split("\n")[-3])
Example #18
    def __init__(self, host='192.168.0.107', port=7777, list_file='inputfiles-full.txt', freqs_file='wordlist', dataset_dir='/home/aelphy/Desktop/ir_project_dataset'):
        self.normalizer = Normalizer()
        self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.socket.connect((host, port))
        self.document_freqs_list_filename = freqs_file
        self.document_list_filename = list_file
        self.dataset_dir = dataset_dir

        self.documents_freqs = {}
        self.document_identificators = {}
        self.identificator_documents = {}

        with open(self.document_list_filename) as f:
            for line in f:
                data = line.strip().split()
                index = int(data[0])
                identifier = data[1]
                self.document_identificators[index] = identifier
                self.identificator_documents[identifier] = index

        self.documents_number = index

        with open(self.document_freqs_list_filename) as f:
            for line in f:
                data = line.strip().split()
                self.documents_freqs[self.identificator_documents[data[1]]] = int(data[0])

        self.avgdl = sum(self.documents_freqs.values()) / float(self.documents_number)
        self.k1 = 2.0
        self.b = 0.75
    def prepare_data(self, chop, tracks, post_process=False):
        normalize = Normalizer().get()

        x = []
        y = []
        for track in tracks:
            x.append(self.mashup[track])
            if self.is_instrumental:
                y.append(self.instrumental[track])
            else:
                y.append(self.vocal[track])

        x = [self.prepare_spectrogram(s) for s in x]
        y = [self.prepare_spectrogram(s) for s in y]

        x, y = normalize(x, y)

        mashup_slices = []
        output_slices = []
        for mashup, output in zip(x, y):
            x_slices, y_slices = chop(mashup, output)

            x_slices = np.array(x_slices)[:]
            y_slices = np.array(y_slices)[:]
            mashup_slices.append(x_slices)
            output_slices.append(y_slices)
        return mashup_slices, output_slices
def main():
    numpy.random.seed(3)

    if not os.path.exists("cache"):
        os.makedirs("cache")

    if os.path.exists("cache/training_data.npy"):
        training_data = numpy.load('cache/training_data.npy')
    else:
        training_data = read_images(TRAIN['data'], DATA_FOLDER)
        numpy.save('cache/training_data', training_data)

    if os.path.exists("cache/training_labels.npy"):
        training_labels = numpy.load('cache/training_labels.npy')
    else:
        training_labels = read_labels(TRAIN['labels'], DATA_FOLDER)
        numpy.save('cache/training_labels', training_labels)

    training_data = training_data.reshape(training_data.shape[0], -1)

    number_of_inputs = training_data.shape[1]
    number_of_labels = numpy.unique(training_labels).size
    topology = [number_of_inputs, 50, 30, number_of_labels]

    if os.path.exists("cache/test_data.npy"):
        test_data = numpy.load('cache/test_data.npy')
    else:
        test_data = read_images(TEST['data'], DATA_FOLDER)
        numpy.save('cache/test_data', test_data)

    if os.path.exists("cache/test_labels.npy"):
        test_labels = numpy.load('cache/test_labels.npy')
    else:
        test_labels = read_labels(TEST['labels'], DATA_FOLDER)
        numpy.save('cache/test_labels', test_labels)

    test_data = test_data.reshape(test_data.shape[0], -1)

    normalized = Normalizer(training_data=training_data, test_data=test_data)

    for i in range(1):
        print("Running {}th time...".format(i))
        neural_network = NeuralNetwork(topology)
        neural_network.train(normalized.training_data(),
                             training_labels,
                             test_data=normalized.test_data(),
                             test_labels=test_labels)
Example #21
 def is_signature_real(cls, signature_image_name: str) -> bool:
     person_id = signature_image_name[-7:-4]
     standards_specifier = SignatureStandardsSpecifier(person_id=person_id)
     standards = standards_specifier.get_standards()
     pic = Normalizer.resize(
         standards.width, standards.height,
         cls.__get_normalized_image(signature_image_name))
     number_of_pixels = Detector.__count_black_pixels(pic)
     return standards.pixel_low_bound < number_of_pixels < standards.pixel_high_bound
Example #22
    def normalize_training_inputs(training_inputs):
        m_training_inputs_only = np.array(training_inputs)
        m_dimensions = m_training_inputs_only.transpose()
        m_dimensions_normalized = np.array([
            Normalizer.normalize_to_stdev(dimension_set)
            for dimension_set in m_dimensions
        ])
        m_training_inputs_normalized = m_dimensions_normalized.transpose()

        return m_training_inputs_normalized
    def reset(self):
        self.action_space = self.env.action_space
        obs_space = self.env.observation_space.spaces
        obs_len = obs_space['observation'].shape[0]
        goal_len = obs_space['desired_goal'].shape[0]
        self.state_size = obs_len + goal_len
        self.actions_size = self.action_space.shape[0]
        max_action = float(self.env.action_space.high[0])

        self.actor = ActorNet(self.state_size, *self.config['net_sizes'],
                              self.actions_size, max_action)
        self.critic = CriticNet(self.state_size, *self.config['net_sizes'],
                                self.actions_size)
        self.actor_target = ActorNet(self.state_size,
                                     *self.config['net_sizes'],
                                     self.actions_size, max_action)
        self.critic_target = CriticNet(self.state_size,
                                       *self.config['net_sizes'],
                                       self.actions_size)
        self.actor_optim = Adam(self.actor.parameters(),
                                lr=self.config['learning_rate'])
        self.critic_optim = Adam(self.critic.parameters(),
                                 lr=self.config['learning_rate'])

        self.update(self.critic_target, self.critic, 1)
        self.update(self.actor_target, self.actor, 1)

        self.epsilon = self.config['epsilon']
        self.epsilon_decay = self.config['epsilon_decay']
        self.gamma = self.config['gamma']

        if self.config['PER']:
            self.memory = PrioritizedMemory(
                self.config['memory_size'], self.config["memory_alpha"],
                self.config["memory_epsilon"], self.config["memory_beta"],
                self.config["memory_beta_increment"])
        else:
            self.memory = ReplayBuffer(self.config['memory_size'])

        self.batch_size = self.config['batch_size']
        self.normalizer = Normalizer(obs_len, goal_len)
        # warm up the normalizer
        self.normalizer.observe(self.env.reset())
class classifier(AffineModel):
    def __init__(self, **kwargs):
        self.normalizer = None
        AffineModel.__init__(self, **kwargs)

    def fit(self, data, *args, **kwargs):
        self.normalizer = Normalizer(data)
        return AffineModel.fit(self, data, *args, **kwargs)

    def score(self, data, *args, **kwargs):
        d = self.normalizer.normalize(data)
        return AffineModel.score(self, d, *args, **kwargs)
Example #25
    def __init__(self, parent=None, show=True):
        Qt.QMainWindow.__init__(self, parent)
        self.ds = reader.DataSet("")
        self.meshes = []
        self.plotter = BackgroundPlotter(shape=(1, 2),
                                         border_color='white',
                                         title="MMR Visualization")
        self.setWindowTitle('MMR UI')
        self.frame = Qt.QFrame()
        vlayout = Qt.QVBoxLayout()
        self.normalizer = Normalizer()
        self.frame.setLayout(vlayout)
        self.setCentralWidget(self.frame)
        mainMenu = self.menuBar()
        fileMenu = mainMenu.addMenu('File')
        exitButton = Qt.QAction('Exit', self)
        exitButton.setShortcut('Ctrl+Q')
        exitButton.triggered.connect(self.close)
        fileMenu.addAction(exitButton)
        meshMenu = mainMenu.addMenu('Mesh')

        self.load_mesh = Qt.QAction('Load mesh', self)
        self.load_mesh.triggered.connect(
            lambda: self.add_mesh(self.open_file_name_dialog()))
        meshMenu.addAction(self.load_mesh)

        self.show_norm_pipeline = Qt.QAction('Show norm pipeline', self)
        self.show_norm_pipeline.triggered.connect(
            lambda: self.show_processing(self.open_file_name_dialog()))
        meshMenu.addAction(self.show_norm_pipeline)

        self.extract_features = Qt.QAction('Extract features', self)
        self.extract_features.triggered.connect(lambda: print(
            FeatureExtractor.mono_run_pipeline(self.open_file_name_dialog())))
        meshMenu.addAction(self.extract_features)

        if show:
            self.show()
Example #26
    def create_network(self):
        # for actor network
        self.o_stats = Normalizer(size=self.dimo,
                                  eps=self.norm_eps,
                                  default_clip_range=self.norm_clip)
        if self.use_goal:
            self.g_stats = Normalizer(size=self.dimg,
                                      eps=self.norm_eps,
                                      default_clip_range=self.norm_clip)
        else:
            self.g_stats = None

        self.main = ActorCritic(self.o_stats, self.g_stats, self.input_dims,
                                self.use_goal).to(self.device)
        self.target = ActorCritic(self.o_stats, self.g_stats, self.input_dims,
                                  self.use_goal).to(self.device)
        self.target.actor = copy.deepcopy(self.main.actor)
        self.target.critic = copy.deepcopy(self.main.critic)

        self.actor_optimizer = optim.Adam(self.main.actor.parameters(),
                                          lr=self.pi_lr)
        self.critic_optimizer = optim.Adam(self.main.critic.parameters(),
                                           lr=self.Q_lr)
Example #27
def main(in_subt, out_subt):
    assert in_subt != ""
    assert out_subt != ""

    parser = Parser()
    normalizer = Normalizer()
    lemma_filter = Filter()

    try:
        f = codecs.open(in_subt, 'r', encoding='utf8')
        text = f.read()
        f.close()
    except IOError:
        sys.exit("The subtitle could not be found in the path you provided.")

    parser.parse(text)
    normalizer.normalize(parser.get_text())
    lemma_filter.clean_lemmas(normalizer.get_lemmas())

    new_sub = Subtitle(parser.get_indexes(), parser.get_times(),
                       parser.get_text(), lemma_filter.get_final_lemmas(),
                       lemma_filter.get_dict(), out_subt)
    new_sub.create_subtitle()
def check_data(manifest, sample):
    normalizer = Normalizer()
    normalizer.read_manifest(manifest)
    with open(sample, 'r') as f:
        sample_file = f.read().strip()
        if len(sample_file) > 1:
            count = 0
            for line in sample_file.split('\n')[1:]:
                # line in the manifest: section_id,section_name,row_id,row_name
                line = line.strip()
                elements = line.split(',')
                section_name = elements[0]
                row_name = elements[1]
                # [(section_id, row_id, bool), number of matches, different section ids we have seen, different row_id we have seen]
                # data example: [(None, None, False), 3, set([170, 11, 118]), set([7])]
                data = normalizer.normalize_raw(section_name, row_name)
                # We are looking for matches that result in `False` but are not caused by
                # database returning multiple entries or row_name value passed as a range
                if data[0][2] == False and data[1] <= 1 and '-' not in row_name:
                    count += 1
                    print(data, section_name, row_name, row_name.isdigit())
            print('BAD MATCH:', count)
            assert count == 0
    def process_spectrogram(self, spectrogram, channels=1):
        chopper = Chopper()
        chopper.name = "infer"
        chopper.params = "{'scale': %d}" % self.config.inference_slice
        chop = chopper.get(both=False)

        slices = chop(spectrogram)

        normalizer = Normalizer()
        normalize = normalizer.get(both=False)
        denormalize = normalizer.get_reverse()

        new_spectrogram = np.zeros((spectrogram.shape[0], 0, channels))
        for slice in slices:
            # normalize
            slice, norm = normalize(slice)

            expanded_spectrogram = conversion.expand_to_grid(
                slice, self.peakDownscaleFactor, channels)
            expanded_spectrogram_with_batch_and_channels = \
                expanded_spectrogram[np.newaxis, :, :]

            predicted_spectrogram_with_batch_and_channels = self.model.predict(
                expanded_spectrogram_with_batch_and_channels)
            # o /// o
            predicted_spectrogram = \
                predicted_spectrogram_with_batch_and_channels[0, :, :, :]
            local_spectrogram = predicted_spectrogram[:slice.shape[0], :slice.
                                                      shape[1], :]

            # de-normalize
            local_spectrogram = denormalize(local_spectrogram, norm)

            new_spectrogram = np.concatenate(
                (new_spectrogram, local_spectrogram), axis=1)
        console.log("Processed spectrogram")
        return spectrogram, new_spectrogram
Example #30
    def __init__(self, hparams):
        super(HER, self).__init__()

        self.hparams = hparams

        self.test_env = make_env(hparams, render=self.hparams.render_test)
        sample_obs = self.test_env.observation_space['observation'].sample()
        sample_goal = self.test_env.observation_space['achieved_goal'].sample()

        # HARD CODED VALUES FOR Bullet-HRL
        action_limits, state_limits = get_env_boundaries()
        action_offset, action_bounds, action_clip_low, action_clip_high = action_limits

        state_shape = sample_obs.shape[0]
        action_shape = self.test_env.action_space.shape[0]
        goal_shape = sample_goal.shape[0]
        self.action_clips = (action_clip_low, action_clip_high)

        self.model = DDPG(params=self.hparams,
                          obs_size=state_shape,
                          goal_size=goal_shape,
                          act_size=action_shape,
                          action_clips=(action_clip_low, action_clip_high),
                          action_bounds=action_bounds,
                          action_offset=action_offset)

        self.model.actor.share_memory()
        self.model.critic.share_memory()

        self.state_normalizer = Normalizer(
            state_shape, default_clip_range=self.hparams.clip_range)
        self.goal_normalizer = Normalizer(
            goal_shape, default_clip_range=self.hparams.clip_range)

        self.replay_buffer = SharedReplayBuffer(self.hparams.buffer_size,
                                                state_shape, action_shape,
                                                goal_shape)
Example #31
    def test_compare_to_scikit_learn_changing_test_size(self):
        normalizer = Normalizer(self.data)
        data = normalizer.normalize()

        for i in range(50, 130, 10):
            with self.subTest(i=i):
                testSize = i
                trainSize = len(data.data) - testSize

                print("test size: ", i)

                neighbours = 5

                trainData = {}
                testData = {}

                trainData['data'] = data.data[:trainSize]
                trainData['target'] = data.target[:trainSize]

                testData['data'] = data.data[trainSize:]
                testData['target'] = data.target[trainSize:]
                knn = KNN(trainData)

                #scikit-learn model:
                model = KNeighborsClassifier(n_neighbors=neighbours)
                model.fit(trainData['data'], trainData['target'])

                ourCounter = 0
                sciCounter = 0
                for i, e in enumerate(testData['data']):
                    if knn.makeGuess(e, neighbours) == testData['target'][i]:
                        ourCounter+=1

                    if model.predict([e]) == testData['target'][i]:
                        sciCounter+=1

                self.assertAlmostEqual(ourCounter/(testSize), sciCounter/(testSize), 3)
Example #32
def create_app(test_config=None):
    app = Flask(__name__)
    app.config['JSON_AS_ASCII'] = False  # retrieve UTF-8 messages

    norm = Normalizer()

    @app.route('/reply', methods=['POST'])
    def reply():
        params = request.json
        if not params:
            return jsonify({
                "status":
                "error",
                "error":
                "Request must be of the application/json type!",
            })

        message = params.get("message")
        method = params.get("method")

        # Make sure the required params are present.
        if message is None or method is None:
            return jsonify({
                "status": "error",
                "error": "message and method are required keys"
            })

        methods = {
            'token': norm.tokenizer,
            'spell': norm.speller,
            'acronym': norm.acronym_searcher,
            'textese': norm.untextese,
            'proper_noun': norm.proper_noun_normalizer
        }

        try:
            reply = methods[method](message)
        except KeyError:
            return jsonify({
                "status":
                "error",
                "error":
                "method not valid, try one of the following: token, spell, acronym, textese or proper_noun"
            })

        # Send the response.
        return jsonify({"status": "ok", "reply": reply})

    return app
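A hedged usage sketch for the /reply endpoint above, assuming the app is served locally on Flask's default port (the URL and port are assumptions):

import requests

resp = requests.post("http://localhost:5000/reply",
                     json={"message": "helo wrld", "method": "spell"})
print(resp.json())  # {"status": "ok", "reply": ...} on success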
    def prepare_random_data(self, tracks, post_process=False):
        normalize = Normalizer().get()
        x = []
        y = []
        for track in tracks:
            x.append(self.mashup[track])
            if self.is_instrumental:
                y.append(self.instrumental[track])
            else:
                y.append(self.vocal[track])

        x = [self.prepare_spectrogram(s) for s in x]
        y = [self.prepare_spectrogram(s) for s in y]
        x, y = normalize(x, y)
        return x, y
Example #34
def trainRobotToWalk():
    from normalizer import Normalizer
    options = parse_config('teachingARobotToWalk.config')
    env = gym.make(options['ENV_NAME'])
    num_states, num_actions = env.observation_space.shape[
        0], env.action_space.shape[0]
    print('options:{}'.format(options))
    agent = AugmentedRandomSearch(num_states, num_actions, options)

    env = gym.wrappers.Monitor(env,
                               options['MONITOR_DIR'],
                               video_callable=agent.should_record_step,
                               force=True)
    normalizer = Normalizer(num_states)

    agent.train(env, normalizer,
                options['RECORD_EVERY'])  # training for our agent
Example #35
 def __init__(self,
              url,
              names,
              label_tag,
              drop_tags=None,
              encode_tags=None,
              normalizer=Normalizer(),
              normal_tags=None,
              test_size=0.2):
     self.url = url
     self.names = names
     self.drop_tags = drop_tags
     self.encode_tags = encode_tags
     self.data = None
     self.label_tag = label_tag
     self.test_size = test_size
     self.enc = ohe(categories='auto')
     self.normal_tags = normal_tags
     self.normalizer = normalizer
Example #36
    #print(res.url)
    #print(res.links)

pr = PageRank(resources)
print(pr.getCurrentRankRow())
print(pr.ranks)
pr.calculate()
print("THE RANKS")
print(pr.getCurrentRankRow())


for index, source in enumerate(resources):
    source.rank = pr.getCurrentRankRow()[index]

resources[0].extractText()
n = Normalizer(resources[0].extractText())
print(n.getTokens())
#for en in pr.end_nodes():
    #print(en.url)
#p = pr.backlinks_for(c.web_resources[0].url)
#print(c.web_resources[0].url)
#for i in p:
    #print(i.url)
#print("####")

#p = pr.parent_for(c.web_resources[1].url)
#print(c.web_resources[1].url)
#for i in p:
    #print(i.url)

Example #37
 def __init__(self, data_dir, start_index=0):
     self.normalizer = Normalizer()
     self.data_dir = data_dir
     self.docs_number = len(glob(os.path.join(self.data_dir, '**', '*.nxml')))
     self.inserter = Inserter()
     self.start_index = start_index
class NLPNET:
    def __init__(self, *args, **kwargs):
        """
        Constructor method, initializes variables.
        """

        # Initializing variables
        self.nlpnet_normalizer = Normalizer()

    def tokenize(self, tokenize_string):
        """
        Returns the tokenized version of tokenize_string, which is just
        a normal English sentence.
        """

        # Setting up the nlpnet parser
        nlpnet.set_data_dir(self.get_data_dir_path())
        pos_parser = nlpnet.POSTagger()

        return pos_parser.tag(tokenize_string)

    def get_dependencies(self, dependency_string):
        """
        Returns dependency_string with sentence dependencies included.
        """

        nlpnet.set_data_dir(self.get_data_dir_path())
        dependency_parser = nlpnet.DependencyParser()

        return dependency_parser.parse(dependency_string)

    def get_data_dir_path(self):
        """
        Returns the directory of the nlpnet corpora.
        """

        # Getting nltk data path
        running = Popen(['python -c "import nltk;print nltk.data.path"'], stdin=PIPE, stdout=PIPE, stderr=PIPE, shell=True)
        stdin, stdout = running.communicate()

        # Setting the path that the nlpnet dependency was downloaded to
        path = re.sub(r"\'", "", re.sub(r"\[", '', str(stdin.split('\n')[0].split(',')[0])))
        path = path.split(r"/")
        path = '/'.join(path[: len(path) - 1]) + '/nlpnet_dependency/dependency'

        return path

    def use_nlpnet(self, base_string, test_string, pattern_arg):
        """
        Main interface method from the NLPNET class to the rest of
        the program.
        """

        # Setting up the nlpnet parser
        nlpnet.set_data_dir(self.get_data_dir_path())
        dependency_parser = nlpnet.DependencyParser()
        pos_parser = nlpnet.POSTagger()

        # Getting the passed patterns
        patterns = pattern_arg

        # Parsing the base_string
        base_parse = dependency_parser.parse(base_string)
        base_blob = TextBlob(base_string)
        base_sentences = base_blob.sentences
        base_sentence_info = []

        for index in range(0, len(base_parse)):
            # Grabbing sentence information
            raw_data = str(base_sentences[index])
            pos_sentence = pos_parser.tag(str(base_sentences[index]))
            subject, verb, object, prepositional_phrases = self.identify_sentence_parts_nlpnet(base_parse[index].tokens, base_parse[index].labels)

            """
            # Displaying information for debugging purposes
            #print "***BASE***"
            #print "Raw Sentence     : " + raw_data
            #print "POS Sentence    : " + str( pos_sentence )
            #print "[ Tokens ]       : " + str( base_parse[ index ].tokens )
            #print "[ Labels ]       : " + str( base_parse[ index ].labels )
            #print "[ Subject ]     : " + subject
            #print "[ Verb ]        : " + verb
            #print "[ Object ]      : " + object
            #print "[ Prep Phrases ] : " + str( prepositional_phrases )
            """

            # Deciding whether the sentence/pattern should be added
            add_sentence = True
            for sentence in base_sentence_info:
                if sentence != []:
                    if sentence[len(sentence) - 1] == raw_data:
                        add_sentence = False

                        break

            # If the sentence should be added to the possible patterns, add it
            if add_sentence:
                base_sentence_info.append([subject, verb, object, [], raw_data])

        # Parsing the test_string
        test_parse = dependency_parser.parse(test_string)
        test_blob = TextBlob(test_string)
        test_sentences = test_blob.sentences
        test_sentence_info = []

        for index in range(0, len(test_parse)):
            # Grabbing sentence information
            raw_data = str(test_sentences[index])
            pos_sentence = pos_parser.tag(str(test_sentences[index]))
            subject, verb, object, prepositional_phrases = self.identify_sentence_parts_nlpnet(test_parse[index].tokens, test_parse[index].labels)

            """
            #print "***TEST***"
            #print "Raw Sentence     : " + raw_data
            #print "POS Sentence    : " + str( pos_sentence )
            #print "[ Tokens ]       : " + str( test_parse[ index ].tokens )
            #print "[ Labels ]       : " + str( test_parse[ index ].labels )
            #print "[ Subject ]     : " + subject
            #print "[ Verb ]        : " + verb
            #print "[ Object ]      : " + object
            #print "[ Prep Phrases ] : " + str( prepositional_phrases )
            """

            # Deciding whether the sentence/pattern should be added
            add_sentence = True
            for sentence in test_sentence_info:
                if sentence != []:
                    if sentence[len(sentence) - 1] == raw_data:
                        add_sentence = False

                        break

            # If the sentence should be added to the possible patterns, add it
            if add_sentence:
                test_sentence_info.append([subject, verb, object, [], raw_data])

        # Returning the patterns found in the text
        return self.identify_common_patterns(base_sentence_info, test_sentence_info, patterns)

    def identify_sentence_parts_nlpnet(self, tokens, labels):
        subject = ""
        verb = ""
        object = ""
        prepositional_phrases = ""

        for index in range(0, len(labels)):
            if "SBJ" in labels[index] and verb == "":
                subject += tokens[index] + " "
            elif "ROOT" in labels[index]:
                verb += tokens[index]
            elif "PRD" in labels[index] or "OBJ" in labels[index]:
                object += tokens[index] + " "
            elif "LOC" in labels[index]:
                for prep_index in range(index, len(labels)):
                    if "PMOD" in labels[prep_index] and ' '.join(tokens[index : prep_index + 1]) not in prepositional_phrases:
                        prepositional_phrases += ' '.join(tokens[index : prep_index + 1]) + "..."

                        break

        return subject, verb, object, prepositional_phrases.split("...")

    def normalize_sentence_info(self, sentence_info):
        """
        Normalizes all of the incoming text to a standard.
        """

        # Normalizing text
        sentence_info = self.nlpnet_normalizer.normalize_sentence_info(sentence_info)

        # Return normalized information
        return sentence_info

    def identify_common_patterns(self, base_sentence_info, test_sentence_info, patterns):
        # Creating variables
        sentence_information = {}

        # Comparing the two sets of strings together & finding patterns
        for base_sentence in base_sentence_info:
            for test_sentence in test_sentence_info:
                # If there are two sentences/patterns to compare
                if base_sentence != [] and test_sentence != []:
                    # Normalize the pattern
                    normalized_base_sentence = self.normalize_sentence_info(base_sentence)
                    normalized_test_sentence = self.normalize_sentence_info(test_sentence)

                    # If the patterns' semantic "value" is the same
                    if normalized_base_sentence[0] == normalized_test_sentence[0] and normalized_base_sentence[1] == normalized_test_sentence[1] and normalized_base_sentence[2] == normalized_test_sentence[2]:
                        # If one sentence/pattern is longer than the other, use that pattern
                        if len(base_sentence[len(base_sentence) - 1].split()) > len(test_sentence[len(test_sentence) - 1].split()):
                            # If other patterns have been detected
                            if patterns != []:
                                sentence_information[base_sentence[len(base_sentence) - 1]] = base_sentence[: len(base_sentence) - 1]
                                sentence_information[base_sentence[len(base_sentence) - 1]].append(2)
                                sentence_information[base_sentence[len(base_sentence) - 1]].append(fuzz.ratio(base_sentence[len(base_sentence) - 1], test_sentence[len(test_sentence) - 1]))

                                # If the current test patterns are not in patterns
                                if test_sentence[len(test_sentence) - 1] not in patterns and base_sentence[len(base_sentence) - 1] not in patterns:
                                    patterns += [base_sentence[len(base_sentence) - 1]]

                                elif base_sentence[len(base_sentence) - 1] in patterns:
                                    # Updating reliability score
                                    try:
                                        sentence_information[base_sentence[len(base_sentence) - 1]][4] += 1
                                    except:
                                        sentence_information[base_sentence[len(base_sentence) - 1]].append(2)
                            # If there are no patterns currently found, add this pattern
                            elif patterns == []:
                                patterns += [base_sentence[len(base_sentence) - 1]]

                                sentence_information[ base_sentence[len(base_sentence) - 1]] = base_sentence[0 : len(base_sentence) - 1]
                                # Updating reliability score
                                try:
                                    sentence_information[base_sentence[len(base_sentence) - 1]][4] += 1
                                except:
                                    sentence_information[base_sentence[len(base_sentence) - 1]].append(2)

                                # Adding applicability score
                                try:
                                    sentence_information[base_sentence[len(base_sentence) - 1]][5] = fuzz.ratio(base_sentence[len(base_sentence) - 1], test_sentence[len(test_sentence) - 1])
                                except:
                                    sentence_information[base_sentence[len(base_sentence) - 1]].append(fuzz.ratio(base_sentence[len(base_sentence) - 1], test_sentence[len(test_sentence) - 1]))
                        else:
                            # If there are patterns already found
                            if patterns != []:
                                sentence_information[test_sentence[len(test_sentence) - 1]] = test_sentence[0 : len(test_sentence) - 1]
                                sentence_information[test_sentence[len(test_sentence) - 1]].append(2)
                                sentence_information[test_sentence[len(test_sentence) - 1]].append(fuzz.ratio(base_sentence[len(base_sentence) - 1], test_sentence[len(test_sentence) - 1]))

                                # If the test patterns are not in the already found patterns
                                if test_sentence[len(test_sentence) - 1] not in patterns and base_sentence[len(base_sentence) - 1] not in patterns:
                                    patterns += [test_sentence[len(test_sentence) - 1]]

                                    #sentence_information[ test_sentence[ len( test_sentence ) - 1 ] ] = test_sentence[ 0 : len( test_sentence ) - 1 ]
                                elif test_sentence[len(test_sentence) - 1] in patterns:
                                    # Updating reliability score
                                    try:
                                        sentence_information[test_sentence[len(test_sentence) - 1]][4] += 1
                                    except:
                                        sentence_information[test_sentence[len(test_sentence) - 1]].append(2)
                            # If there are no patterns currently found
                            elif patterns == []:
                                patterns += [test_sentence[len(test_sentence) - 1]]

                                sentence_information[test_sentence[len(test_sentence) - 1]] = test_sentence[: len(test_sentence) - 1]
                                # Updating reliability score
                                try:
                                    sentence_information[test_sentence[len(test_sentence) - 1]][4] += 1
                                except:
                                    sentence_information[test_sentence[len(test_sentence) - 1]].append(2)

                                # Adding applicability score
                                try:
                                    sentence_information[test_sentence[len(test_sentence) - 1]][5] = fuzz.ratio(base_sentence[len(base_sentence) - 1], test_sentence[len(test_sentence) - 1])
                                except:
                                    sentence_information[test_sentence[len(test_sentence) - 1]].append(fuzz.ratio(base_sentence[len(base_sentence) - 1], test_sentence[len(test_sentence) - 1]))

        return patterns, sentence_information
Example #39
class Client():
    def __init__(self, host='192.168.0.107', port=7777, list_file='inputfiles-full.txt', freqs_file='wordlist', dataset_dir='/home/aelphy/Desktop/ir_project_dataset'):
        self.normalizer = Normalizer()
        self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.socket.connect((host, port))
        self.document_freqs_list_filename = freqs_file
        self.document_list_filename = list_file
        self.dataset_dir = dataset_dir

        self.documents_freqs = {}
        self.document_identificators = {}
        self.identificator_documents = {}

        with open(self.document_list_filename) as f:
            for line in f:
                data = line.strip().split()
                index = int(data[0])
                identifier = data[1]
                self.document_identificators[index] = identifier
                self.identificator_documents[identifier] = index

        self.documents_number = index

        with open(self.document_freqs_list_filename) as f:
            for line in f:
                data = line.strip().split()
                self.documents_freqs[self.identificator_documents[data[1]]] = int(data[0])

        self.avgdl = sum(self.documents_freqs.values()) / float(self.documents_number)
        self.k1 = 2.0
        self.b = 0.75


    def process_query(self, query):
        terms = self.normalizer.normalize_line(query)

        if not terms:
            return set()

        inverted_index = {}
        term_doc_tf = {}

        for term in terms:
            self.send_message(term)
            term_doc_tf[term] = self.parse_message_array(self.recieve_message().split())
            inverted_index[term] = term_doc_tf[term].keys()

        documents = self.merge(inverted_index, terms)

        if not documents:
            return set()

        return self.rank(documents, term_doc_tf, terms)


    def recieve_message(self):
        message = ''

        while not message.endswith('\n'):
            message += self.socket.recv(1024).decode('utf-8')

        return message.strip()


    def send_message(self, message):
        self.socket.send((message + '\n').encode('utf-8'))


    def parse_message_array(self, message_array):
        result = {}
        i = 0

        while i <= len(message_array) - 1:
            result[int(message_array[i])] = int(message_array[i + 1])
            i = i + 2

        return result


    def merge(self, inverted_index, terms):
        result = set(inverted_index[terms[0]])

        for i in range(1, len(terms)):
            result = result.intersection(set(inverted_index[terms[i]]))

        return result


    def rank(self, documents, term_doc_tf, terms):
        document_scores = {}
        document_freqs = {}

        for document in documents:
            document_scores[document] = self.score_document(document, term_doc_tf, terms)

        result = sorted(document_scores.items(), key=lambda x: x[1], reverse=True)
        best_match = result[0]
        best_match_file = open(os.path.join(self.dataset_dir, self.document_identificators[best_match[0]]), 'r')
        best_match_data = ' '.join(best_match_file.readlines())

        for term in terms:
            best_match_data = best_match_data.replace(term, color.RED + term + color.END)

        print(best_match_data)
        return result


    def score_document(self, document, term_doc_tf, terms):
        result = 0

        for term in terms:
            IDF = math.log((self.documents_number - len(term_doc_tf[term]) + 0.5) / (len(term_doc_tf[term]) + 0.5))
            f = term_doc_tf[term][document]

            result += IDF * (f * (self.k1 + 1)) / (f + self.k1 * (1 - self.b + self.b * self.documents_freqs[document] / self.avgdl))

        return result
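score_document above is the Okapi BM25 weighting with k1 = 2.0 and b = 0.75; a quick numeric check of a single term's contribution, with made-up counts (every value below is hypothetical):

import math

documents_number = 1000    # corpus size
df = 10                    # documents containing the term
f = 3                      # term frequency in the candidate document
doc_len, avgdl = 120, 100  # document length vs. average document length
k1, b = 2.0, 0.75

idf = math.log((documents_number - df + 0.5) / (df + 0.5))
score = idf * (f * (k1 + 1)) / (f + k1 * (1 - b + b * doc_len / avgdl))
print(round(score, 3))  # single-term contribution to the document's score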
for line in fin:
	line = line.strip()
	tweet.append(line)

#store in dataframe
prab_data = DataFrame(tweet)

#rename the column
prab_data.columns = ['tweet']

#score the tweet
score = []

#create normalizer object
norm = Normalizer()

#create Stop word Removal object
st = StpRemoval()

#create sentiment analysis object
s = Sentianal()

for i in range(0,len(prab_data)):
	#normalize
	line = norm.normalize(prab_data['tweet'][i])
	
	#remove stopword
	line = st.removeStp(line)

	#score sentiment
class PATTERN:
    def __init__(self, *args, **kwargs):
        """
        Constructor method, initializes variables.
        """

        # Initializing variables
        self.pattern_normalizer = Normalizer()

    def tokenize(self, tokenize_string):
        """
        Returns the tokenized version of tokenize_string,
        which is just a normal English sentence.
        """

        return parse(tokenize_string,
            tokenize = True,         # Split punctuation marks from words?
            tags = True,         # Parse part-of-speech tags? (NN, JJ, ...)
            chunks = False,         # Parse chunks? (NP, VP, PNP, ...)
            relations = False,        # Parse chunk relations? (-SBJ, -OBJ, ...)
            lemmata = False,        # Parse lemmata? (ate => eat)
            encoding = 'utf-8',       # Input string encoding.
            tagset = None)

    def find_dependencies(self, dependency_string):
        """
        Returns dependency_string with sentence dependencies included.
        """

        return parse(dependency_string, relations=True)

    def use_pattern(self, base_string, test_string, pattern_arg):
        patterns = pattern_arg

        # Creating string textblob for analysis & analyzing the base_string's sentences
        base_blob = TextBlob(base_string)
        base_sentence_info = []

        for base_sentence in base_blob.sentences:
            subject = ""
            verb = ""
            object = ""
            prepositional_phrases = ""
            raw_data = parse(str(base_sentence), relations=True)

            for word in parse(str(base_sentence), relations=True).split():
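                # Each token from parse(..., relations=True) has the form
                # word/POS/chunk/PNP/relation (e.g. "pizza/NN/B-NP/O/NP-OBJ-1"),
                # so re.sub(r'/.*', '', word) keeps only the surface word, the
                # relation labels route it to subject/object/verb or a PNP, and
                # "..." is appended as a delimiter between prepositional phrases.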
                if "SBJ-" in word:
                    subject += re.sub(r'/.*', '', word) + " "
                elif "OBJ-" in word:
                    object += re.sub(r'/.*', '', word) + " "
                elif "VP-" in word:
                    verb += re.sub(r'/.*', '', word) + " "
                elif "PNP" in word:
                    prepositional_phrases += re.sub(r'/.*', '', word) + " "
                elif "PNP" not in word and prepositional_phrases[len(prepositional_phrases) - 3:] != "...":
                    prepositional_phrases += "..."

            """
            #print "[ Subject ]: " + subject
            #print "[ Object ]: " + object
            #print "[ Verb ]: " + verb
            #print "[ Prepositional Phrases ]: " + str( prepositional_phrases.split( '...' )[ 1:len(prepositional_phrases.split( '...' )) ] )
            #print "[ Raw Data ]: " + raw_data
            """

            add_sentence = True
            for sentence in base_sentence_info:
                if sentence != []:
                    if sentence[len(sentence) - 1] == str(base_sentence):
                        add_sentence = False

                        break

            if add_sentence:
                base_sentence_info.append([subject, verb, object, prepositional_phrases.split('...')[1:], str(base_sentence)])

        # Creating string textblob for analysis & analyzing the test_string's sentences
        test_blob = TextBlob(test_string)
        test_sentence_info = []

        for test_sentence in test_blob.sentences:
            subject = ""
            verb = ""
            object = ""
            prepositional_phrases = ""
            raw_data = parse(str(test_sentence), relations=True)

            for word in parse(str(test_sentence), relations=True).split():
                if "SBJ-" in word:
                    subject += re.sub(r'/.*', '', word) + " "
                elif "OBJ-" in word:
                    object += re.sub(r'/.*', '', word) + " "
                elif "VP-" in word:
                    verb += re.sub(r'/.*', '', word) + " "
                elif "PNP" in word:
                    prepositional_phrases += re.sub(r'/.*', '', word) + " "
                elif "PNP" not in word and prepositional_phrases[len(prepositional_phrases) - 3:] != "...":
                    prepositional_phrases += "..."

            """
            #print "[ Subject ]: " + subject
            #print "[ Object ]: " + object
            #print "[ Verb ]: " + verb
            #print "[ Prepositional Phrases ]: " + str( prepositional_phrases.split( '...' )[ 1:len(prepositional_phrases.split( '...' )) ] )
            #print "[ Raw Data ]: " + raw_data
            """

            add_sentence = True
            for sentence in test_sentence_info:
                if sentence != []:
                    if sentence[len(sentence) - 1] == str(test_sentence):
                        add_sentence = False

                        break

            if add_sentence:
                test_sentence_info.append([subject, verb, object, prepositional_phrases.split('...')[1:], str(test_sentence)])

        return self.identify_common_patterns(base_sentence_info, test_sentence_info, patterns)

    def normalize_sentence_info(self, sentence_info):
        """
        Normalizes all of the incoming text to a standard.
        """

        # Normalizing text
        sentence_info = self.pattern_normalizer.normalize_sentence_info(sentence_info)

        # Return normalized information
        return sentence_info

    def identify_common_patterns(self, base_sentence_info, test_sentence_info, patterns):
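        """
        Compares the analyzed base and test sentence structures and returns the
        updated pattern list plus, for each pattern sentence, its
        [subject, verb, object, prepositional phrases] information together with
        reliability and applicability scores.
        """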
        # Creating variables
        sentence_information = {}

        # Comparing the two sets of strings together & finding patterns
        for base_sentence in base_sentence_info:
            for test_sentence in test_sentence_info:
                # If there are two sentences/patterns to compare
                if base_sentence != [] and test_sentence != []:
                    # Normalize the pattern
                    normalized_base_sentence = self.normalize_sentence_info(base_sentence)
                    normalized_test_sentence = self.normalize_sentence_info(test_sentence)

                    # If the patterns' semantic "value" is the same
                    if normalized_base_sentence[0] == normalized_test_sentence[0] and normalized_base_sentence[1] == normalized_test_sentence[1] and normalized_base_sentence[2] == normalized_test_sentence[2]:
                        # If one sentence/pattern is longer than the other, use that pattern
                        if len(base_sentence[len(base_sentence) - 1].split()) > len(test_sentence[len(test_sentence) - 1].split()):
                            # If other patterns have been detected
                            if patterns != []:
                                sentence_information[base_sentence[len(base_sentence) - 1]] = base_sentence[: len(base_sentence) - 1]
                                sentence_information[base_sentence[len(base_sentence) - 1]].append(2)
                                sentence_information[base_sentence[len(base_sentence) - 1]].append(fuzz.ratio(base_sentence[len(base_sentence) - 1], test_sentence[len(test_sentence) - 1]))

                                # If the current test patterns are not in patterns
                                if test_sentence[len(test_sentence) - 1] not in patterns and base_sentence[len(base_sentence) - 1] not in patterns:
                                    patterns += [base_sentence[len(base_sentence) - 1]]

                                elif base_sentence[len(base_sentence) - 1] in patterns:
                                    # Updating reliability score
                                    try:
                                        sentence_information[base_sentence[len(base_sentence) - 1]][4] += 1
                                    except IndexError:
                                        sentence_information[base_sentence[len(base_sentence) - 1]].append(2)
                            # If there are no patterns currently found, add this pattern
                            elif patterns == []:
                                patterns += [base_sentence[len(base_sentence) - 1]]

                                sentence_information[base_sentence[len(base_sentence) - 1]] = base_sentence[0 : len(base_sentence) - 1]
                                # Updating reliability score
                                try:
                                    sentence_information[base_sentence[len(base_sentence) - 1]][4] += 1
                                except IndexError:
                                    sentence_information[base_sentence[len(base_sentence) - 1]].append(2)

                                # Adding applicability score
                                try:
                                    sentence_information[base_sentence[len(base_sentence) - 1]][5] = fuzz.ratio(base_sentence[len(base_sentence) - 1], test_sentence[len(test_sentence) - 1])
                                except IndexError:
                                    sentence_information[base_sentence[len(base_sentence) - 1]].append(fuzz.ratio(base_sentence[len(base_sentence) - 1], test_sentence[len(test_sentence) - 1]))
                        else:
                            # If there are patterns already found
                            if patterns != []:
                                sentence_information[test_sentence[len(test_sentence) - 1]] = test_sentence[0 : len(test_sentence) - 1]
                                sentence_information[test_sentence[len(test_sentence) - 1]].append(2)
                                sentence_information[test_sentence[len(test_sentence) - 1]].append(fuzz.ratio(base_sentence[len(base_sentence) - 1], test_sentence[len(test_sentence) - 1]))

                                # If the test patterns are not in the already found patterns
                                if test_sentence[len(test_sentence) - 1] not in patterns and base_sentence[len(base_sentence) - 1] not in patterns:
                                    patterns += [test_sentence[len(test_sentence) - 1]]

                                    #sentence_information[ test_sentence[ len( test_sentence ) - 1 ] ] = test_sentence[ 0 : len( test_sentence ) - 1 ]
                                elif test_sentence[len(test_sentence) - 1] in patterns:
                                    # Updating reliability score
                                    try:
                                        sentence_information[test_sentence[len(test_sentence) - 1]][4] += 1
                                    except IndexError:
                                        sentence_information[test_sentence[len(test_sentence) - 1]].append(2)
                            # If there are no patterns currently found
                            elif patterns == []:
                                patterns += [test_sentence[len(test_sentence) - 1]]

                                sentence_information[test_sentence[len(test_sentence) - 1]] = test_sentence[: len(test_sentence) - 1]
                                # Updating reliability score
                                try:
                                    sentence_information[test_sentence[len(test_sentence) - 1]][4] += 1
                                except IndexError:
                                    sentence_information[test_sentence[len(test_sentence) - 1]].append(2)

                                # Adding applicability score
                                try:
                                    sentence_information[test_sentence[len(test_sentence) - 1]][5] = fuzz.ratio(base_sentence[len(base_sentence) - 1], test_sentence[len(test_sentence) - 1])
                                except IndexError:
                                    sentence_information[test_sentence[len(test_sentence) - 1]].append(fuzz.ratio(base_sentence[len(base_sentence) - 1], test_sentence[len(test_sentence) - 1]))

        return patterns, sentence_information
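
A minimal usage sketch for the class above. It assumes the module-level dependencies this example relies on are importable (parse, presumably from pattern.en, TextBlob from textblob, fuzz from fuzzywuzzy, re, and the project's Normalizer); the two input strings are purely illustrative:

# Compare two texts and collect their common sentence patterns.
matcher = PATTERN()

base_text = "The committee approved the budget on Friday."
test_text = "The committee approved the budget."

patterns, info = matcher.use_pattern(base_text, test_text, [])
print(patterns)  # sentences identified as shared patterns
print(info)      # pattern sentence -> [subject, verb, object, PNPs] plus reliability/applicability scores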
Exemple #42
0
# fin and tweet are assumed to be defined in the part of the snippet that was cut off.
for line in fin:
    line = line.strip()
    tweet.append(line)

# store in a dataframe
jkw_data = DataFrame(tweet)

# rename the column
jkw_data.columns = ['tweet']

# sentiment score for each tweet
score = []

# create normalizer object
norm = Normalizer()

# create stop word removal object
st = StpRemoval()

# create sentiment analysis object
s = Sentianal()

for i in range(0, len(jkw_data)):
    # normalize
    line = norm.normalize(jkw_data['tweet'][i])

    # remove stop words
    line = st.removeStp(line)

    # score sentiment
Exemple #43
0
# Author : Alfan F. Wicaksono
# IR Lab, FASILKOM, UI

# Script for pre-processing twitter corpus

from normalizer import Normalizer
from stpremoval import StpRemoval

##################### you can modify this part ######################

corpusFile = "debatcapres_2014_sesi1.txt"
preprocessedFile = "debatcapres_2014_sesi1_processed.txt"

#####################################################################

nm = Normalizer()
sw = StpRemoval()

fin = open(corpusFile, "r")
fout = open(preprocessedFile, "w")

for line in fin:
    line = line.strip()  # strip the trailing newline and surrounding whitespace
    line = nm.normalize(line)  # normalization
    line = sw.removeStp(line)  # remove stop word
    fout.write(line)  # put preprocessed tweet on the new file
    fout.write("\n")

fin.close()
fout.close()
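
The same loop can also be written with context managers so that both files are closed even if normalization fails partway through; a sketch under the same assumptions (the author's normalizer and stpremoval modules and the corpus file above):

from normalizer import Normalizer
from stpremoval import StpRemoval

corpus_file = "debatcapres_2014_sesi1.txt"
preprocessed_file = "debatcapres_2014_sesi1_processed.txt"

nm = Normalizer()
sw = StpRemoval()

# "with" closes both files even if normalize() or removeStp() raises.
with open(corpus_file, "r") as fin, open(preprocessed_file, "w") as fout:
    for line in fin:
        line = nm.normalize(line.strip())  # normalization
        line = sw.removeStp(line)          # stop word removal
        fout.write(line + "\n")            # write the preprocessed tweet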
    def fit(self, data, *args, **kwargs):
        # Fit a Normalizer on the training data before delegating to AffineModel.fit.
        self.normalizer = Normalizer(data)
        return AffineModel.fit(self, data, *args, **kwargs)
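
The fit override above follows a common pattern: fit a normalizer on the training data once, keep it on the instance, and delegate the actual fitting to the base class. A self-contained toy version of that pattern (TinyNormalizer and BaseModel below are illustrative stand-ins, not the Normalizer or AffineModel from the example):

class TinyNormalizer:
    """Illustrative stand-in: centers values around the training mean."""
    def __init__(self, data):
        self.mean = sum(data) / len(data)

    def apply(self, data):
        return [x - self.mean for x in data]


class BaseModel:
    def fit(self, data, *args, **kwargs):
        self.fitted_on = list(data)
        return self


class NormalizingModel(BaseModel):
    def fit(self, data, *args, **kwargs):
        # Keep the normalizer fitted on the training data so later calls can
        # reuse exactly the same statistics.
        self.normalizer = TinyNormalizer(data)
        return BaseModel.fit(self, data, *args, **kwargs)


model = NormalizingModel().fit([1.0, 2.0, 3.0])
print(model.normalizer.apply([4.0]))  # [2.0]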