Example #1
 def get_dominant(self):
     """
     Return the dominant colour of the image, skipping peaks that are too
     dark or too light. Such peaks are usually the background of a photo,
     which matters when the image is transparent or is a logo.
     """
     stop = False
     while not stop:
         vecs, dist = vq(self.ar, self.codes)  # assign each pixel to a code
         counts, bins = histogram(vecs,
                                  len(self.codes))  # count occurrences per code
         index_max = argmax(counts)
         peak = self.codes[index_max]
         lumin = self.get_luminosity(peak)
         if lumin > 200.0 or lumin < 35.0:  # too light or too dark
             self.codes = np_delete(self.codes, index_max,
                                    axis=0)  # retry without that code
         else:
             stop = True
             rgb = tuple(peak[:3])
             # build the hex string directly; str.encode('hex') is Python 2 only
             dom_hex = "#%02x%02x%02x" % tuple(int(c) for c in rgb)
     return dom_hex, rgb
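A minimal, self-contained sketch of the same peak-rejection idea, written against plain NumPy. The 'codes' codebook, the counts, and the luminosity thresholds below are made-up values for illustration, not the original class state:

import numpy as np

codes = np.array([[250.0, 250.0, 248.0],   # near-white background peak
                  [30.0, 120.0, 200.0],    # a saturated blue
                  [10.0, 12.0, 9.0]])      # near-black peak
counts = np.array([900, 350, 120])         # how often each code was assigned

def luminosity(rgb):
    # Rec. 601 luma approximation
    return 0.299 * rgb[0] + 0.587 * rgb[1] + 0.114 * rgb[2]

while True:
    index_max = int(np.argmax(counts))
    peak = codes[index_max]
    if 35.0 < luminosity(peak) < 200.0:
        break
    # drop the too-dark/too-light peak and retry with the next most frequent one
    codes = np.delete(codes, index_max, axis=0)
    counts = np.delete(counts, index_max)

print("#%02x%02x%02x" % tuple(int(c) for c in peak))  # -> #1e78c8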
Example #2
    def remove(self, range_to_remove: Tuple[int, int]) -> 'NoteSequence':
        assert len(range_to_remove) == 2
        validate_sequence_of_type('range_to_remove', range_to_remove, int)
        # noinspection PyTupleAssignmentBalance
        range_start, range_end = range_to_remove
        # Drop the attribute rows for notes in the half-open interval [range_start, range_end)
        self.note_attr_vals = np_delete(self.note_attr_vals, range(range_start, range_end), axis=0)

        self.update_range_map()
        return self
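The same contiguous-block removal can be tried in isolation; a minimal sketch with a made-up attribute matrix (the names below are placeholders, not the NoteSequence internals):

import numpy as np

note_attr_vals = np.arange(20).reshape(5, 4)   # 5 notes x 4 attributes each
range_to_remove = (1, 3)                       # half-open: drops rows 1 and 2

range_start, range_end = range_to_remove
note_attr_vals = np.delete(note_attr_vals, range(range_start, range_end), axis=0)
print(note_attr_vals.shape)  # (3, 4)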
Example #3
 def reduceIndicies(self, deadRowIndicies):
     """purge indices from the data structures
     
     Be sure that deadRowIndicies are sorted ascending
     """
     # strip out the other values        
     self.indices = np_delete(self.indices, deadRowIndicies, axis=0)
     self.covProfiles = np_delete(self.covProfiles, deadRowIndicies, axis=0)
     self.transformedCP = np_delete(self.transformedCP, deadRowIndicies, axis=0)
     self.contigNames = np_delete(self.contigNames, deadRowIndicies, axis=0)
     self.contigLengths = np_delete(self.contigLengths, deadRowIndicies, axis=0)
     self.contigColours = np_delete(self.contigColours, deadRowIndicies, axis=0)
     self.kmerSigs = np_delete(self.kmerSigs, deadRowIndicies, axis=0)
     self.kmerVals = np_delete(self.kmerVals, deadRowIndicies, axis=0)
     self.binIds = np_delete(self.binIds, deadRowIndicies, axis=0)
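The pattern above deletes the same ascending index list from several parallel arrays so that they stay aligned row-for-row. A toy sketch of that idea (the array names and data are placeholders, not the original object's attributes):

import numpy as np

contig_names = np.array(["c1", "c2", "c3", "c4", "c5"])
contig_lengths = np.array([1200, 850, 430, 2100, 990])
cov_profiles = np.random.rand(5, 3)    # one coverage row per contig

dead_rows = [1, 3]                     # indices to purge, sorted ascending
contig_names = np.delete(contig_names, dead_rows, axis=0)
contig_lengths = np.delete(contig_lengths, dead_rows, axis=0)
cov_profiles = np.delete(cov_profiles, dead_rows, axis=0)

print(contig_names)        # ['c1' 'c3' 'c5']
print(cov_profiles.shape)  # (3, 3)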
Example #4
 def _filteringAnomaly(data, func):
     '''
     Filter out noise points and outliers.
     :param data: 1-D array of values to clean
     :param func: callable returning the indices of outliers in 'data'
     :return: 'data' with the outlier entries removed
     '''
     # If the spread of values is tiny, there is nothing worth filtering
     if data.max() - data.min() <= 2:
         return data
     index = func(data)
     data = np_delete(data, index)
     return data
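A possible caller, with a simple z-score rule standing in for the user-supplied 'func'. The 'zscore_outliers' helper and the sample data are illustrative assumptions, not part of the original code:

import numpy as np

def zscore_outliers(data, threshold=2.0):
    # indices of points more than 'threshold' standard deviations from the mean
    z = np.abs((data - data.mean()) / data.std())
    return np.where(z > threshold)[0]

data = np.array([10.0, 11.0, 9.5, 10.2, 55.0, 10.8])
# equivalent to _filteringAnomaly(data, zscore_outliers) when numpy.delete
# is imported as np_delete
cleaned = np.delete(data, zscore_outliers(data))
print(cleaned)  # the 55.0 spike is removed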
Example #5
def remove_same_rows(n_pos, X_neg, X_pos, neg_comp_list):
    # Remove negative feature rows that exactly match any row in the positives
    count = 0
    for ind in range(n_pos):
        matching_inds = np_where((X_neg == X_pos[ind]).all(axis=1))
        X_neg = np_delete(X_neg, matching_inds[0], axis=0)
        # Delete the matching entries from neg_comp_list in reverse order so
        # earlier deletions do not shift the remaining indices
        for index in sorted(list(matching_inds[0]), reverse=True):
            count += 1
            del neg_comp_list[index]
    print("No. of negs removed due to same feature vector =", count)
    n_neg = len(X_neg)
    return X_neg, neg_comp_list, n_neg
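A toy run of the same row-matching removal; 'X_pos', 'X_neg', and the companion list are made-up data:

import numpy as np

X_pos = np.array([[1, 0, 1],
                  [0, 1, 1]])
X_neg = np.array([[1, 0, 1],    # duplicate of positive row 0
                  [0, 0, 1],
                  [0, 1, 1]])   # duplicate of positive row 1
neg_comp_list = ["neg_a", "neg_b", "neg_c"]

for row in X_pos:
    matches = np.where((X_neg == row).all(axis=1))[0]
    X_neg = np.delete(X_neg, matches, axis=0)
    for idx in sorted(matches, reverse=True):
        del neg_comp_list[idx]

print(X_neg)          # [[0 0 1]]
print(neg_comp_list)  # ['neg_b']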
Example #6
    def reduceIndices(self, deadRowIndices):
        """purge indices from the data structures

        Be sure that deadRowIndices are sorted ascending
        """
        # strip out the other values
        self.indices = np_delete(self.indices, deadRowIndices, axis=0)
        self.covProfiles = np_delete(self.covProfiles, deadRowIndices, axis=0)
        self.transformedCP = np_delete(self.transformedCP, deadRowIndices, axis=0)
        self.contigNames = np_delete(self.contigNames, deadRowIndices, axis=0)
        self.contigLengths = np_delete(self.contigLengths, deadRowIndices, axis=0)
        self.contigGCs = np_delete(self.contigGCs, deadRowIndices, axis=0)
        #self.kmerSigs = np_delete(self.kmerSigs, deadRowIndices, axis=0)
        self.kmerPCs = np_delete(self.kmerPCs, deadRowIndices, axis=0)
        self.binIds = np_delete(self.binIds, deadRowIndices, axis=0)
Example #8
    def _make_fully_connections(neuron,
                                previous_layer,
                                random_state,
                                maximum_neuron_connection_weight,
                                prob_skip_connection=0.0,
                                neurons_for_skip_connections=None):
        """Connects a neuron with all other neurons from a given neural network's layer.

        Parameters
        ----------
        neuron : Neuron
            Neuron to connect with a previous layer.

        previous_layer : array of shape (num_neurons,)
            Set of neurons that can be connected or not with 'neuron'.

        random_state : RandomState instance
            A NumPy random number generator.

        maximum_neuron_connection_weight : float
            Maximum value a weight connection between two neurons can have.

        prob_skip_connection : float, optional, default 0.0
            Probability that a neuron in a hidden layer establishes a connection
            with a neuron from a layer other than the immediately preceding one.

        neurons_for_skip_connections : array of shape (num_neurons,), optional, default None
            Set of neurons from previous layers (excluding the neural network's
            immediately preceding layer) that can be connected with 'neuron'.
            When None, either the first hidden layer is being connected to the
            input layer or skip connections are not being used.
        """
        # Verify if skip connections can occur:
        if neurons_for_skip_connections is not None and len(
                neurons_for_skip_connections) > 0:
            # Walk over the positions of the previous layer:
            for i in range(len(previous_layer)):
                generated_prob_skip_connection = random_state.uniform()

                if generated_prob_skip_connection <= prob_skip_connection:
                    # Get random index for previous_neuron:
                    previous_neuron_id = NeuralNetworkBuilder._raffle_neuron_id(
                        random_state, 0, len(neurons_for_skip_connections))
                    previous_neuron = neurons_for_skip_connections[
                        previous_neuron_id]
                    # Remove previous neuron from neurons_for_skip_connections to avoid future duplicated connections:
                    neurons_for_skip_connections = np_delete(
                        neurons_for_skip_connections, previous_neuron_id)
                    # Set flag regarding previous_layer:
                    is_from_previous_layer = False
                else:
                    previous_neuron = previous_layer[i]
                    # Set flag regarding previous_layer:
                    is_from_previous_layer = True
                NeuralNetworkBuilder._connect_neurons(
                    previous_neuron, neuron,
                    random_state.uniform(-maximum_neuron_connection_weight,
                                         maximum_neuron_connection_weight),
                    True, is_from_previous_layer)

        # There are no skip connections to be taken into account:
        else:
            for previous_neuron in previous_layer:
                NeuralNetworkBuilder._connect_neurons(
                    previous_neuron, neuron,
                    random_state.uniform(-maximum_neuron_connection_weight,
                                         maximum_neuron_connection_weight),
                    True, True)
Example #9
    def _make_sparse_connections(neuron,
                                 previous_layer,
                                 random_state,
                                 maximum_neuron_connection_weight,
                                 sparseness,
                                 neurons_for_skip_connections=None):
        """Sparse connections mean that a given neuron is able to not set some
        connections for a given layer.

        Parameters
        ----------
        neuron : Neuron
            Neuron to connect with a previous layer.

        previous_layer : array of shape (num_neurons,)
            Set of neurons that can be connected or not with 'neuron'.

        maximum_neuron_connection_weight : float
            Maximum value a weight connection between two neurons can have.

        random_state : RandomState instance
            A Numpy random number generator.

        sparseness : dict
            Dictionary containing information regarding neurons' connections, namely
            sparseness and the existence of skip connections (keys: 'sparse',
            'minimum_sparseness', 'maximum_sparseness', and 'prob_skip_connection').

        neurons_for_skip_connections : array of shape (num_neurons,), optional, default None
            Set of neurons from previous layers (excluding the neural network's
            immediately preceding layer) that can be connected with 'neuron'.
            When None, either the first hidden layer is being connected to the
            input layer or skip connections are not being used.
        """
        # Get neuron's sparseness, i.e., from the total number of possible connections define randomly the number of connections to activate:
        proportion_sparse_connections = random_state.uniform(
            sparseness.get('minimum_sparseness'),
            sparseness.get('maximum_sparseness'))
        # Note: it's necessary to guarantee at least 1 connection
        num_connections = max(
            round(len(previous_layer) * (1 - proportion_sparse_connections)),
            1)
        # Get neurons of previous layer to connect with current neuron (sample without replacement):
        neurons_to_connect = random_state.choice(previous_layer,
                                                 num_connections,
                                                 replace=False)

        #=======================================================================
        # if neurons_for_skip_connections:
        #     print('len(neurons_for_skip_connections) vs. len(neurons_to_connect) =', len(neurons_for_skip_connections), ' vs.', len(neurons_to_connect))
        #=======================================================================

        # Connect neurons:
        if neurons_for_skip_connections is not None and len(
                neurons_for_skip_connections) > 0:
            for _ in range(num_connections):
                prob_skip_connection = random_state.uniform()

                #===============================================================
                # if len(neurons_for_skip_connections) == 0:
                #     print('\t\t\t[Debug] len(neurons_for_skip_connections) == 0')
                #===============================================================
                if len(neurons_to_connect) == 0:
                    print('\t\t\t[Debug] len(neurons_to_connect) == 0')

                if (len(neurons_for_skip_connections) > 0 and
                        prob_skip_connection <= sparseness.get(
                            'prob_skip_connection')):
                    # Get random index for previous_neuron:
                    previous_neuron_id = NeuralNetworkBuilder._raffle_neuron_id(
                        random_state, 0, len(neurons_for_skip_connections))
                    # Collect neuron and remove it from the list to avoid future duplicate connections:
                    previous_neuron = neurons_for_skip_connections[
                        previous_neuron_id]
                    neurons_for_skip_connections = np_delete(
                        neurons_for_skip_connections, previous_neuron_id)
                    # Set flag regarding previous_layer:
                    is_from_previous_layer = False
                else:
                    # Get random index for previous_neuron:
                    previous_neuron_id = NeuralNetworkBuilder._raffle_neuron_id(
                        random_state, 0, len(neurons_to_connect))
                    # Collect neuron and remove it from the list to avoid future duplicate connections:
                    previous_neuron = neurons_to_connect[previous_neuron_id]
                    neurons_to_connect = np_delete(neurons_to_connect,
                                                   previous_neuron_id)
                    # Set flag regarding previous_layer:
                    is_from_previous_layer = True

                # Connect the two neurons:
                NeuralNetworkBuilder._connect_neurons(
                    previous_neuron, neuron,
                    random_state.uniform(-maximum_neuron_connection_weight,
                                         maximum_neuron_connection_weight),
                    True, is_from_previous_layer)
        else:  # There are no connections to be skipped:
            for previous_neuron in neurons_to_connect:
                NeuralNetworkBuilder._connect_neurons(
                    previous_neuron, neuron,
                    random_state.uniform(-maximum_neuron_connection_weight,
                                         maximum_neuron_connection_weight),
                    True, True)
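Both connection routines above lean on the same NumPy idiom: draw a random index, take that element, then np.delete it so the same neuron cannot be drawn twice. A stand-alone sketch of that sampling-without-repetition pattern (using a plain default_rng instead of the builder's helpers):

import numpy as np

rng = np.random.default_rng(0)
candidates = np.array([11, 22, 33, 44, 55])   # stand-ins for candidate neurons

picked = []
for _ in range(3):
    idx = int(rng.integers(0, len(candidates)))  # random index into what is left
    picked.append(int(candidates[idx]))
    candidates = np.delete(candidates, idx)      # remove it so it cannot repeat

print(picked)      # three distinct values from the original array
print(candidates)  # the two values that were never drawn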
Example #10
def tfidf_filter(cur_dict: Dict[str, dict],
                 prev_dict: Optional[Dict[str, dict]] = None,
                 max_similarity: float = 0.75):
    """ Fit a TFIDF vectorizer to a corpus of all listing's text

        Args:
            cur_dict: today's job scrape dict
            prev_dict: the existing master list job dict
            max_similarity: threshold above which blurb similarity = duplicate

        Returns:
            dict of duplicate job ids mapped to the entries removed from cur_dict
    """
    # Retrieve stopwords if not already downloaded.
    try:
        stopwords = nltk.corpus.stopwords.words('english')
    except LookupError:
        try:
            nltk.download('stopwords', quiet=True)
            stopwords = nltk.corpus.stopwords.words('english')
        except Exception as e:
            print(e)
            stopwords = None  # fall back to no stop-word filtering

    # init vectorizer
    vectorizer = TfidfVectorizer(strip_accents='unicode',
                                 lowercase=True,
                                 analyzer='word',
                                 stop_words=stopwords)
    # init dict to store duplicate ids:
    duplicate_ids = {}

    if prev_dict is None:
        # get query words and ids as lists
        query_ids = [job['id'] for job in cur_dict.values()]
        query_words = [job['blurb'] for job in cur_dict.values()]

        # Returns cosine similarity between jobs as square matrix (n,n)
        similarities = cosine_similarity(vectorizer.fit_transform(query_words))
        # Fills diagonals with 0, so whole dict does not get popped
        fill_diagonal(similarities, 0)
        # init index
        index = 0
        # Identifies duplicates and stores them in duplicate_ids dictionary
        while True:
            # Loop breaks when index is equal to matrix height
            if index == len(similarities):
                break

            # Deletes row and column, every time a max is found for a job id.
            if np_max(similarities[index]) >= max_similarity:
                # Query ids are popped so index always matches correct element.
                duplicate_ids.update(
                    {query_ids[index]: cur_dict.pop(query_ids.pop(index))})
                # Reduce matrix dimensions, (n-1, n-1)
                similarities = np_delete(similarities, index, axis=0)
                similarities = np_delete(similarities, index, axis=1)

            else:  # Increment index by one
                index += 1
        # log something
        logging.info(f'Found and removed {len(duplicate_ids.keys())} '
                     f're-posts/duplicates via TFIDF cosine similarity!')

    else:
        # Checks current scrape for re-posts/duplicates
        duplicate_ids = tfidf_filter(cur_dict)

        # get query words and ids as lists
        query_ids = [job['id'] for job in cur_dict.values()]
        query_words = [job['blurb'] for job in cur_dict.values()]

        # get reference words as list
        reference_words = [job['blurb'] for job in prev_dict.values()]

        # fit vectorizer to entire corpus
        vectorizer.fit(query_words + reference_words)

        # set reference tfidf for cosine similarity later
        references = vectorizer.transform(reference_words)

        # calculate cosine similarity between reference and current blurbs
        similarities = cosine_similarity(vectorizer.transform(query_words),
                                         references)

        # get duplicate job ids and pop them
        for sim, query_id in zip(similarities, query_ids):
            if np_max(sim) >= max_similarity:
                duplicate_ids.update({query_id: cur_dict.pop(query_id)})

        # log something
        logging.info(f'found {len(cur_dict.keys())} unique listings and '
                     f'{len(duplicate_ids.keys())} duplicates '
                     f'via TFIDF cosine similarity')

    # Returns a dictionary of duplicates
    return duplicate_ids
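The core deduplication step, zeroing the diagonal and then deleting a matching row and column, can be isolated on a toy similarity matrix. The values and ids below are made up and no TFIDF fitting is shown:

import numpy as np

ids = ["job_a", "job_b", "job_c"]
similarities = np.array([[1.00, 0.92, 0.10],
                         [0.92, 1.00, 0.15],
                         [0.10, 0.15, 1.00]])
np.fill_diagonal(similarities, 0)   # a job must not count as its own duplicate

max_similarity = 0.75
duplicates = []
index = 0
while index < len(similarities):
    if similarities[index].max() >= max_similarity:
        duplicates.append(ids.pop(index))
        # shrink the matrix from (n, n) to (n-1, n-1)
        similarities = np.delete(similarities, index, axis=0)
        similarities = np.delete(similarities, index, axis=1)
    else:
        index += 1

print(duplicates)  # ['job_a']
print(ids)         # ['job_b', 'job_c']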