def get_dominant(self):
    """Find the dominant colour while rejecting peaks that are too dark or
    too light, which are usually the background of photos. This comes in
    handy when images are transparent or are a logo.
    """
    stop = False
    while not stop:
        vecs, dist = vq(self.ar, self.codes)             # assign codes
        counts, bins = histogram(vecs, len(self.codes))  # count occurrences
        index_max = argmax(counts)
        peak = self.codes[index_max]
        lumin = self.get_luminosity(peak)
        if lumin > 200.0 or lumin < 35.0:  # too light, too dark
            # retry without that instance
            self.codes = np_delete(self.codes, index_max, axis=0)
        else:
            stop = True

    rgb = ''.join(chr(int(c)) for index, c in enumerate(peak)
                  if not index == 3)
    dom_hex = "#%s" % rgb.encode('hex')
    rgb = tuple(peak[:3])
    return dom_hex, rgb
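# Not part of the original project: a minimal, self-contained sketch of the
# np_delete pattern used in get_dominant() above, i.e. dropping one rejected
# row from a k-means codebook by index (axis=0) before retrying. Assumes the
# `from numpy import delete as np_delete` aliasing used throughout these
# examples; the colour values are made up for illustration.
def _demo_drop_codebook_row():
    from numpy import array, delete as np_delete
    codes = array([[250.0, 250.0, 250.0],   # near-white peak -> rejected
                   [120.0, 64.0, 32.0]])    # plausible dominant colour
    index_max = 0                           # index of the rejected peak
    codes = np_delete(codes, index_max, axis=0)
    return codes                            # array([[120., 64., 32.]])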
def remove(self, range_to_remove: Tuple[int, int]) -> 'NoteSequence':
    assert len(range_to_remove) == 2
    validate_sequence_of_type('range_to_remove', range_to_remove, int)
    # noinspection PyTupleAssignmentBalance
    range_start, range_end = range_to_remove
    self.note_attr_vals = np_delete(self.note_attr_vals,
                                    range(range_start, range_end),
                                    axis=0)
    self.update_range_map()
    return self
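# Hedged, stand-alone sketch of the pattern in NoteSequence.remove() above:
# np_delete accepts a range() of row indices, so a contiguous block of rows
# can be removed from a 2-D attribute matrix in one call. The array shape is
# an assumption for illustration only.
def _demo_remove_row_range():
    from numpy import arange, delete as np_delete
    note_attr_vals = arange(12).reshape(6, 2)   # 6 notes, 2 attributes each
    range_start, range_end = 1, 3               # drop rows 1 and 2
    return np_delete(note_attr_vals, range(range_start, range_end), axis=0)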
def reduceIndicies(self, deadRowIndicies):
    """Purge indices from the data structures.

    Be sure that deadRowIndicies are sorted ascending.
    """
    # strip out the other values
    self.indices = np_delete(self.indices, deadRowIndicies, axis=0)
    self.covProfiles = np_delete(self.covProfiles, deadRowIndicies, axis=0)
    self.transformedCP = np_delete(self.transformedCP, deadRowIndicies, axis=0)
    self.contigNames = np_delete(self.contigNames, deadRowIndicies, axis=0)
    self.contigLengths = np_delete(self.contigLengths, deadRowIndicies, axis=0)
    self.contigColours = np_delete(self.contigColours, deadRowIndicies, axis=0)
    self.kmerSigs = np_delete(self.kmerSigs, deadRowIndicies, axis=0)
    self.kmerVals = np_delete(self.kmerVals, deadRowIndicies, axis=0)
    self.binIds = np_delete(self.binIds, deadRowIndicies, axis=0)
def _filteringAnomaly(data, func):
    """Filter out noise points and outliers.

    :param data: input array
    :param func: callable that returns the indices of outliers in ``data``
    :return: ``data`` with the outlier elements removed
    """
    if data.max() - data.min() <= 2:
        return data
    index = func(data)
    data = np_delete(data, index)
    return data
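# Illustrative sketch (toy data, hypothetical outlier rule) of the call in
# _filteringAnomaly() above: when axis is omitted, np_delete flattens the
# input and removes elements by position, which suits 1-D noise filtering.
def _demo_delete_outliers():
    from numpy import array, where, delete as np_delete
    data = array([1.0, 1.2, 9.5, 0.9])
    index = where(data > 5.0)[0]        # stand-in for the `func` argument
    return np_delete(data, index)       # array([1. , 1.2, 0.9])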
def remove_same_rows(n_pos, X_neg, X_pos, neg_comp_list):
    # Removing negative feature rows that exactly match any row in positives
    cout = 0
    for ind in range(n_pos):
        matching_inds = np_where((X_neg == X_pos[ind]).all(axis=1))
        X_neg = np_delete(X_neg, matching_inds, axis=0)
        for index in sorted(list(matching_inds[0]), reverse=True):
            cout += 1
            del neg_comp_list[index]
    print("No. of negs removed due to same feature vector = ", cout)
    n_neg = len(X_neg)
    return X_neg, neg_comp_list, n_neg
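# Self-contained sketch (assumed toy arrays) of the remove_same_rows() pattern
# above: np_where finds the rows of X_neg identical to a positive row, and
# np_delete drops those rows in one call along axis 0.
def _demo_drop_matching_rows():
    from numpy import array, where as np_where, delete as np_delete
    X_pos = array([[0, 1, 1]])
    X_neg = array([[0, 1, 1], [1, 0, 0], [0, 1, 1]])
    matching_inds = np_where((X_neg == X_pos[0]).all(axis=1))
    return np_delete(X_neg, matching_inds[0], axis=0)  # only [[1, 0, 0]] remains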
def reduceIndices(self, deadRowIndices):
    """Purge indices from the data structures.

    Be sure that deadRowIndices are sorted ascending.
    """
    # strip out the other values
    self.indices = np_delete(self.indices, deadRowIndices, axis=0)
    self.covProfiles = np_delete(self.covProfiles, deadRowIndices, axis=0)
    self.transformedCP = np_delete(self.transformedCP, deadRowIndices, axis=0)
    self.contigNames = np_delete(self.contigNames, deadRowIndices, axis=0)
    self.contigLengths = np_delete(self.contigLengths, deadRowIndices, axis=0)
    self.contigGCs = np_delete(self.contigGCs, deadRowIndices, axis=0)
    #self.kmerSigs = np_delete(self.kmerSigs, deadRowIndices, axis=0)
    self.kmerPCs = np_delete(self.kmerPCs, deadRowIndices, axis=0)
    self.binIds = np_delete(self.binIds, deadRowIndices, axis=0)
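# Stand-alone sketch (toy arrays, made-up names) of the purge pattern used by
# both reduceIndicies/reduceIndices variants above: applying np_delete with
# the same sorted index list to every parallel array keeps them row-aligned.
def _demo_purge_parallel_arrays(dead_row_indices):
    from numpy import array, delete as np_delete
    cov_profiles = array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
    contig_names = array(['c1', 'c2', 'c3'])
    cov_profiles = np_delete(cov_profiles, dead_row_indices, axis=0)
    contig_names = np_delete(contig_names, dead_row_indices, axis=0)
    return cov_profiles, contig_names   # e.g. dead_row_indices=[1] keeps rows 0 and 2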
def _make_fully_connections(neuron, previous_layer, random_state,
                            maximum_neuron_connection_weight,
                            prob_skip_connection=0.0,
                            neurons_for_skip_connections=None):
    """Connects a neuron with all other neurons from a given neural network's layer.

    Parameters
    ----------
    neuron : Neuron
        Neuron to connect with a previous layer.

    previous_layer : array of shape (num_neurons,)
        Set of neurons that can be connected or not with 'neuron'.

    random_state : RandomState instance
        A Numpy random number generator.

    maximum_neuron_connection_weight : float
        Maximum value a weight connection between two neurons can have.

    prob_skip_connection : float, optional, default 0.0
        Probability of a neuron on a hidden layer establishing a connection
        with a neuron of a layer other than the immediately preceding one.

    neurons_for_skip_connections : array of shape (num_neurons,), optional, default None
        Set of neurons from previous layers (except the neural network's
        immediately preceding layer) that can be connected or not with
        'neuron'. When None, it means that we are either connecting the first
        hidden layer to the input layer, or skip connections are not being
        performed.
    """
    # Verify if skip connections can occur:
    if neurons_for_skip_connections:
        for i in range(len(previous_layer)):
            generated_prob_skip_connection = random_state.uniform()
            if generated_prob_skip_connection <= prob_skip_connection:
                # Get random index for previous_neuron:
                previous_neuron_id = NeuralNetworkBuilder._raffle_neuron_id(
                    random_state, 0, len(neurons_for_skip_connections))
                previous_neuron = neurons_for_skip_connections[previous_neuron_id]
                # Remove previous neuron from neurons_for_skip_connections to
                # avoid future duplicated connections:
                neurons_for_skip_connections = np_delete(
                    neurons_for_skip_connections, previous_neuron_id)
                # Set flag regarding previous_layer:
                is_from_previous_layer = False
            else:
                previous_neuron = previous_layer[i]
                # Set flag regarding previous_layer:
                is_from_previous_layer = True

            NeuralNetworkBuilder._connect_neurons(
                previous_neuron, neuron,
                random_state.uniform(-maximum_neuron_connection_weight,
                                     maximum_neuron_connection_weight),
                True, is_from_previous_layer)
    else:
        # There are no skip connections to be taken into account:
        [NeuralNetworkBuilder._connect_neurons(
            previous_neuron, neuron,
            random_state.uniform(-maximum_neuron_connection_weight,
                                 maximum_neuron_connection_weight),
            True, True)
         for previous_neuron in previous_layer]
def _make_sparse_connections(neuron, previous_layer, random_state,
                             maximum_neuron_connection_weight, sparseness,
                             neurons_for_skip_connections=None):
    """Sparse connections mean that a given neuron is allowed to omit some
    connections to a given layer.

    Parameters
    ----------
    neuron : Neuron
        Neuron to connect with a previous layer.

    previous_layer : array of shape (num_neurons,)
        Set of neurons that can be connected or not with 'neuron'.

    random_state : RandomState instance
        A Numpy random number generator.

    maximum_neuron_connection_weight : float
        Maximum value a weight connection between two neurons can have.

    sparseness : dict
        Dictionary containing information regarding neurons' connections,
        namely sparseness and the existence of skip connections (keys:
        'sparse', 'minimum_sparseness', 'maximum_sparseness', and
        'prob_skip_connection').

    neurons_for_skip_connections : array of shape (num_neurons,), optional, default None
        Set of neurons from previous layers (except the neural network's
        immediately preceding layer) that can be connected or not with
        'neuron'. When None, it means that we are either connecting the first
        hidden layer to the input layer, or skip connections are not being
        performed.
    """
    # Get neuron's sparseness, i.e., from the total number of possible
    # connections, randomly define the number of connections to activate:
    proportion_sparse_connections = random_state.uniform(
        sparseness.get('minimum_sparseness'),
        sparseness.get('maximum_sparseness'))
    # Note: it's necessary to guarantee at least 1 connection
    num_connections = max(
        round(len(previous_layer) * (1 - proportion_sparse_connections)), 1)
    # Get neurons of previous layer to connect with current neuron
    # (sample without replacement):
    neurons_to_connect = random_state.choice(previous_layer, num_connections,
                                             replace=False)
    #=======================================================================
    # if neurons_for_skip_connections:
    #     print('len(neurons_for_skip_connections) vs. len(neurons_to_connect) =',
    #           len(neurons_for_skip_connections), ' vs.', len(neurons_to_connect))
    #=======================================================================

    # Connect neurons:
    if neurons_for_skip_connections:
        for _ in range(num_connections):
            prob_skip_connection = random_state.uniform()
            #===============================================================
            # if len(neurons_for_skip_connections) == 0:
            #     print('\t\t\t[Debug] len(neurons_for_skip_connections) == 0')
            #===============================================================
            if len(neurons_to_connect) == 0:
                print('\t\t\t[Debug] len(neurons_to_connect) == 0')

            if len(neurons_for_skip_connections) > 0 and \
                    prob_skip_connection <= sparseness.get('prob_skip_connection'):
                # Get random index for previous_neuron:
                previous_neuron_id = NeuralNetworkBuilder._raffle_neuron_id(
                    random_state, 0, len(neurons_for_skip_connections))
                # Collect neuron and remove it from the list to avoid future
                # duplicate connections:
                previous_neuron = neurons_for_skip_connections[previous_neuron_id]
                neurons_for_skip_connections = np_delete(
                    neurons_for_skip_connections, previous_neuron_id)
                # Set flag regarding previous_layer:
                is_from_previous_layer = False
            else:
                # Get random index for previous_neuron:
                previous_neuron_id = NeuralNetworkBuilder._raffle_neuron_id(
                    random_state, 0, len(neurons_to_connect))
                # Collect neuron and remove it from the list to avoid future
                # duplicate connections:
                previous_neuron = neurons_to_connect[previous_neuron_id]
                neurons_to_connect = np_delete(neurons_to_connect,
                                               previous_neuron_id)
                # Set flag regarding previous_layer:
                is_from_previous_layer = True

            # Connect the two neurons:
            NeuralNetworkBuilder._connect_neurons(
                previous_neuron, neuron,
                random_state.uniform(-maximum_neuron_connection_weight,
                                     maximum_neuron_connection_weight),
                True, is_from_previous_layer)
    else:
        # There are no connections to be skipped:
        [NeuralNetworkBuilder._connect_neurons(
            previous_neuron, neuron,
            random_state.uniform(-maximum_neuron_connection_weight,
                                 maximum_neuron_connection_weight),
            True, True)
         for previous_neuron in neurons_to_connect]
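# Not from the original NeuralNetworkBuilder: a hedged sketch of the
# draw-without-replacement idiom both connection builders above rely on.
# Because np_delete returns a new 1-D array without the chosen index, later
# draws from the shrinking pool cannot produce duplicate connections.
def _demo_draw_without_replacement(random_state, pool):
    from numpy import delete as np_delete
    idx = random_state.randint(0, len(pool))  # stand-in for _raffle_neuron_id
    chosen = pool[idx]
    pool = np_delete(pool, idx)               # pool shrinks by one element
    return chosen, pool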
def tfidf_filter(cur_dict: Dict[str, dict],
                 prev_dict: Optional[Dict[str, dict]] = None,
                 max_similarity: float = 0.75):
    """Fit a TFIDF vectorizer to a corpus of all listings' text.

    Args:
        cur_dict: today's job scrape dict
        prev_dict: the existing master-list job dict
        max_similarity: threshold above which blurb similarity = duplicate

    Returns:
        dict of duplicate job ids which were removed from cur_dict
    """
    # Retrieve stopwords if not already downloaded.
    try:
        stopwords = nltk.corpus.stopwords.words('english')
    except LookupError:
        try:
            nltk.download('stopwords', quiet=True)
            stopwords = nltk.corpus.stopwords.words('english')
        except Exception as e:
            print(e)

    # init vectorizer
    vectorizer = TfidfVectorizer(strip_accents='unicode', lowercase=True,
                                 analyzer='word', stop_words=stopwords)
    # init dict to store duplicate ids:
    duplicate_ids = {}

    if prev_dict is None:
        # get query words and ids as lists
        query_ids = [job['id'] for job in cur_dict.values()]
        query_words = [job['blurb'] for job in cur_dict.values()]

        # Returns cosine similarity between jobs as a square matrix (n, n)
        similarities = cosine_similarity(vectorizer.fit_transform(query_words))
        # Fill the diagonal with 0 so the whole dict does not get popped
        fill_diagonal(similarities, 0)

        # init index
        index = 0
        # Identify duplicates and store them in the duplicate_ids dictionary
        while True:
            # Loop breaks when index is equal to matrix height
            if index == len(similarities):
                break

            # Delete a row and column every time a max is found for a job id.
            if np_max(similarities[index]) >= max_similarity:
                # Query ids are popped so index always matches correct element.
                duplicate_ids.update(
                    {query_ids[index]: cur_dict.pop(query_ids.pop(index))})
                # Reduce matrix dimensions to (n-1, n-1)
                similarities = np_delete(similarities, index, axis=0)
                similarities = np_delete(similarities, index, axis=1)
            else:
                # Increment index by one
                index += 1

        logging.info(f'Found and removed {len(duplicate_ids.keys())} '
                     f're-posts/duplicates via TFIDF cosine similarity!')
    else:
        # Check current scrape for re-posts/duplicates
        duplicate_ids = tfidf_filter(cur_dict)

        # get query words and ids as lists
        query_ids = [job['id'] for job in cur_dict.values()]
        query_words = [job['blurb'] for job in cur_dict.values()]

        # get reference words as a list
        reference_words = [job['blurb'] for job in prev_dict.values()]

        # fit vectorizer to the entire corpus
        vectorizer.fit(query_words + reference_words)

        # set reference tfidf for cosine similarity later
        references = vectorizer.transform(reference_words)

        # calculate cosine similarity between reference and current blurbs
        similarities = cosine_similarity(vectorizer.transform(query_words),
                                         references)

        # get duplicate job ids and pop them
        for sim, query_id in zip(similarities, query_ids):
            if np_max(sim) >= max_similarity:
                duplicate_ids.update({query_id: cur_dict.pop(query_id)})

        logging.info(f'found {len(cur_dict.keys())} unique listings and '
                     f'{len(duplicate_ids.keys())} duplicates '
                     f'via TFIDF cosine similarity')

    # Return a dictionary of duplicates
    return duplicate_ids
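# Toy-matrix sketch (not project data) of the matrix-shrinking step inside
# tfidf_filter() above: deleting row `index` and column `index` keeps the
# similarity matrix square and aligned with the remaining job ids.
def _demo_shrink_similarity_matrix():
    from numpy import array, delete as np_delete
    similarities = array([[0.0, 0.9, 0.1],
                          [0.9, 0.0, 0.2],
                          [0.1, 0.2, 0.0]])
    index = 0                                            # duplicate at row 0
    similarities = np_delete(similarities, index, axis=0)
    similarities = np_delete(similarities, index, axis=1)
    return similarities                                  # shape (2, 2)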