def on_track(self, line):
    """Tell whether the tracker is still close enough to ``line``.

    ``line`` is unpacked into the ``Line`` constructor, and the distance from
    that line to ``self.tracker.getPosition()`` is compared against
    ``THRESHOLD``.

    Returns ``False`` when ``line`` is ``None``; otherwise the result of the
    ``distance < THRESHOLD`` comparison. A falsy result means the caller
    should re-navigate.
    """
    if line is None:
        return False

    track = Line(*line)
    offset = track.distance(self.tracker.getPosition())
    return offset < THRESHOLD
def _select_optimal_threshold(self, thresholds: np.ndarray, spot_counts: List[int]) -> float:
    """Pick a threshold at the elbow of the spot-count gradient curve.

    The gradient of ``spot_counts`` is computed and stored on ``self._grad``.
    Only thresholds from the gradient minimum onwards are considered. Among
    those (end points excluded), the one whose (threshold, gradient) point
    lies farthest from the line joining the first and last points is chosen.
    If ``self.stringency`` steps further along still falls inside the
    candidate list, that later threshold is returned instead.

    Returns 0 when fewer than two thresholds remain after trimming, or when
    no interior candidates exist.
    """
    gradient = np.gradient(spot_counts)
    self._grad = gradient

    # Discard everything before the steepest drop in spot count.
    steepest = np.argmin(gradient)
    candidates = thresholds[steepest:]
    tail = gradient[steepest:]

    if len(candidates) <= 1:
        # Nothing to choose from: fall back to 0.
        return 0

    # Reference line through the first and last (threshold, gradient) points.
    chord = Line(
        Point(candidates[0], tail[0]),
        Point(candidates[-1], tail[-1]),
    )
    offsets = [
        chord.distance(Point(thr, tail[idx])).evalf()
        for idx, thr in enumerate(candidates)
    ]

    # The end points lie on the line by construction, so leave them out.
    inner_thresholds = candidates[1:-1]
    inner_offsets = offsets[1:-1]
    if not inner_offsets:
        return 0

    peak = np.argmax(np.array(inner_offsets))
    shifted = peak + self.stringency
    if shifted < len(inner_thresholds):
        return inner_thresholds[shifted]
    return inner_thresholds[peak]
def Choose_cluster_number_distance(self, dict1):
    """Locate the elbow of a key -> value curve by distance to its chord.

    The items of ``dict1`` are sorted by key and each ``(key, value)`` pair is
    turned into a ``Point``. A ``Line`` is drawn through the first and last
    points, and the point farthest from that line wins (the earliest one on
    ties).

    Returns the 1-based rank of the winning point in key order. Note that this
    is the rank, not the dictionary key itself; the two coincide only when the
    keys are 1, 2, 3, ...
    """
    ordered = sorted(dict1.items(), key=lambda item: item[0])
    chord = Line(Point(ordered[0]), Point(ordered[-1]))

    # Map 1-based rank -> distance from the chord.
    offsets = {
        rank: chord.distance(Point(entry))
        for rank, entry in enumerate(ordered, start=1)
    }

    cluster_number, _ = max(offsets.items(), key=lambda pair: pair[1])
    logger.info("Best K value is" + str(cluster_number))
    return cluster_number
def get_main_content(content):
    """Extract the main text of an HTML document.

    Steps:

    1. Repair unclosed ``br``/``img``/``input`` tags via ``StringHelper`` and
       parse the result with ``html.parser``.
    2. Remove non-content tags (scripts, links, headers, media, ...) and HTML
       comments.
    3. Collect every text block returned by ``get_text_paths`` whose stripped
       length is at least 50 characters, together with its converted path.
    4. Describe each block by ``(path score, text length)`` and cluster these
       with ``k_means`` for 1..9 clusters (fewer when there are fewer blocks).
       The cluster count is the elbow of the score curve: the point farthest
       from the line joining the first and last points.
    5. Return the blocks of the cluster with the largest total text length.

    Parameters
    ----------
    content : str
        Raw HTML markup.

    Returns
    -------
    str
        The selected text blocks joined by single spaces. With fewer than
        three qualifying blocks no elbow can be computed, so all qualifying
        blocks are returned (an empty string when there are none).

    Notes
    -----
    ``k_means`` is used as ``k_means(X, n)`` with element ``[1]`` read as the
    per-row labels and element ``[2]`` as the clustering score; presumably
    this is scikit-learn's ``k_means`` -- confirm against the file's imports.
    """
    string_helper = StringHelper()
    content = string_helper.replace_unclose_br_tag(content)
    content = string_helper.replace_unclose_img_tag(content)
    content = string_helper.replace_unclose_input_tag(content)
    soup = BeautifulSoup(content, 'html.parser')
    soup_helper = SoupHelper()

    # Strip markup that never carries main content.
    noise_tags = (
        'script', 'iframe', 'a', 'header', 'label', 'option', 'head',
        'footer', 'style', 'noscript', 'video', 'videolist', 'source',
        'track',
    )
    for tag_name in noise_tags:
        for node in soup(tag_name):
            node.extract()
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    d_text_path = get_text_paths(soup)[0]

    def get_node_position(node: str):
        # A path node ends in "_<position>".
        return int(node.split('_')[-1])

    def get_path_score(path: str) -> int:
        # Weight positions by powers of 3, the last node having weight 3**0.
        l_node = path.split(' ')
        l_node.reverse()
        return sum(
            (3 ** index) * get_node_position(node)
            for index, node in enumerate(l_node)
        )

    # Keep only text blocks long enough to be real content.
    l_path = []
    l_content = []
    for text_path in d_text_path:
        text = d_text_path[text_path]
        if text_path.strip() == 'text_node':
            continue
        if len(text.strip()) < 50:
            continue
        l_path.append(soup_helper.convert_text_path_to_path(text_path))
        l_content.append(text)
    l_content_length = [len(text) for text in l_content]

    # The elbow search needs at least two distinct curve points, i.e. at
    # least three blocks. Below that, behave like the single-cluster case and
    # return everything that qualified.
    if len(l_content) < 3:
        return ' '.join(l_content)

    X = np.array([
        [get_path_score(path), length]
        for path, length in zip(l_path, l_content_length)
    ])

    # Score curve: x is the cluster count scaled by a tenth of the
    # one-cluster score, y is the rounded score for that cluster count.
    plot_score = []
    plot_num = []
    total_case = min(len(X), 10)
    for i in range(1, total_case):
        score = k_means(X, i)[2]
        if i == 1:
            num_rate = score / 10
        plot_score.append(round(score, 2))
        plot_num.append(i * num_rate)

    # Elbow: the interior point farthest from the chord through the first and
    # last curve points. Default to one cluster.
    k = 1
    first = (plot_num[0], plot_score[0])
    last = (plot_num[-1], plot_score[-1])
    if first != last:  # a line cannot be built from two identical points
        line = Line(
            Point(first[0], first[1], evaluate=False),
            Point(last[0], last[1], evaluate=False),
            evaluate=False,
        )
        farthest = 0
        for cluster_count in range(2, len(plot_score)):
            # Curve index cluster_count - 1 holds the score for that count.
            candidate = Point(
                plot_num[cluster_count - 1],
                plot_score[cluster_count - 1],
                evaluate=False,
            )
            distance = float(line.distance(candidate))
            if farthest < distance:
                farthest = distance
                k = cluster_count

    l_label = k_means(X, k)[1]

    # Total text length per cluster; the largest cluster is the main content.
    length_by_label = {}
    for label, length in zip(l_label, l_content_length):
        length_by_label[label] = length_by_label.get(label, 0) + length

    candidate_label = 0
    highest_score = 0
    for label, score in length_by_label.items():
        if score > highest_score:
            highest_score = score
            candidate_label = label

    return ' '.join(
        text
        for label, text in zip(l_label, l_content)
        if label == candidate_label
    )
def _empty_counting_dict(thr_array, stringency):
    """Build the result returned when no usable threshold can be selected."""
    return {
        'selected_thr': 0,
        'calculated_thr': 0,
        'selected_peaks': 0,
        'thr_array': thr_array,
        'trimmed_thr_array': 0,
        'peaks_coords': 0,
        'trimmed_peaks_coords': 0,
        'total_peaks': 0,
        'thr_idx': 0,
        'stringency': stringency,
        'selected_peaks_int': 0,
    }


def thr_calculator(filtered_img, min_distance, stringency):
    """
    Calculate the threshold to use for dot counting in a 2D image.

    Peaks are counted for 100 thresholds spread between the image minimum and
    maximum. The threshold is picked at the elbow of the gradient of the peak
    counts: the point farthest from the line joining the end points of the
    gradient curve, after dropping everything before the gradient minimum.

    Parameters:
    -----------

    filtered_img: np.array float64
        preprocessed image used to count the dots.
    min_distance: int
        minimum distance that two maxima need to have in order to be defined
        as separate peaks.
    stringency: int
        number of steps to move up from the calculated threshold in the
        trimmed threshold array. It is applied only if the resulting index is
        still inside that array.

    Returns:
    -----------

    counting_dict : dict
        dictionary containing all the counting infos. When no threshold can
        be selected every entry except 'thr_array' and 'stringency' is the
        integer 0.

        selected_thr: float64
            Thr used for counting after application of the stringency.
        calculated_thr: float64
            Calculated Thr, before stringency.
        selected_peaks: int64
            2D coords of the peaks found with selected_thr after objects with
            area < 6 or > 200 pixels are masked out.
        thr_array: float64
            Tested thresholds. Cut at the first threshold giving 3 or fewer
            peaks whenever more than one threshold produced peaks; otherwise
            the full 100-point array.
        trimmed_thr_array: float64
            Thresholds after the gradient minimum, end points removed.
        peaks_coords: list
            Peak coords for every threshold in thr_array.
        trimmed_peaks_coords: list
            Peak coords matching trimmed_thr_array.
        total_peaks: list of int
            Peak counts for every threshold in thr_array.
        thr_idx: int64
            index of the selected threshold in trimmed_thr_array (includes
            the stringency shift when it was applied).
        stringency: int64
            stringency passed in.
        selected_peaks_int: float64
            image intensity at each selected peak.
    """
    # Objects outside this area range (in pixels) are removed before the
    # final peak detection.
    min_obj_area = 6
    max_obj_area = 200

    # Peak counts and peak coords for each tested threshold
    total_peaks = []
    peaks_coords = []

    # Thresholds to be tested
    thr_array = np.linspace(filtered_img.min(), filtered_img.max(), num=100)

    # Count the peaks for each threshold; object size is not considered here.
    for thr in thr_array:
        # `indices` is not passed: returning coordinates is the default, and
        # the argument was deprecated in scikit-image 0.18 and removed in
        # later releases.
        peaks = feature.peak_local_max(
            filtered_img, min_distance=min_distance, threshold_abs=thr,
            exclude_border=False, num_peaks=np.inf, footprint=None,
            labels=None)
        # Stop as soon as a threshold yields 3 or fewer peaks
        if len(peaks) <= 3:
            break
        peaks_coords.append(peaks)
        total_peaks.append(len(peaks))

    # No peaks at all, or only one threshold that produces peaks
    if len(total_peaks) <= 1:
        return _empty_counting_dict(thr_array, stringency)

    # Keep only the thresholds that produced a count. One count is stored per
    # threshold before the stop, so the count list length is the cut index.
    thr_array = thr_array[:len(total_peaks)]

    # Gradient of the peak-count distribution
    grad = np.gradient(total_peaks)

    # Drop everything before the gradient minimum
    grad_min_peak_coord = np.argmin(grad)
    trimmed_thr_array = thr_array[grad_min_peak_coord:]
    trimmed_grad = grad[grad_min_peak_coord:]
    if len(trimmed_thr_array) <= 1:
        return _empty_counting_dict(thr_array, stringency)
    trimmed_peaks_coords = peaks_coords[grad_min_peak_coord:]

    # The threshold is the point with the biggest distance to the line that
    # joins the end points of the trimmed gradient.
    p1 = Point(trimmed_thr_array[0], trimmed_grad[0])
    p2 = Point(trimmed_thr_array[-1], trimmed_grad[-1])
    s = Line(p1, p2)
    distances = [
        s.distance(Point(thr, slope)).evalf()
        for thr, slope in zip(trimmed_thr_array, trimmed_grad)
    ]

    # Remove the end points, which define the line
    trimmed_thr_array = trimmed_thr_array[1:-1]
    trimmed_peaks_coords = trimmed_peaks_coords[1:-1]
    trimmed_distances = distances[1:-1]
    if not trimmed_distances:
        return _empty_counting_dict(thr_array, stringency)

    # The distances are converted to an array before argmax (the original
    # code notes that using the list directly crashed on one system).
    thr_idx = np.argmax(np.array(trimmed_distances))
    calculated_thr = trimmed_thr_array[thr_idx]

    # The calculated threshold usually oversamples the dots, so move up by
    # `stringency` steps when the trimmed array is long enough.
    if thr_idx + stringency < len(trimmed_thr_array):
        thr_idx = thr_idx + stringency
    selected_thr = trimmed_thr_array[thr_idx]

    # A non-positive threshold cannot be used for masking; report it as
    # "nothing found" instead of failing on an unbound variable.
    if not selected_thr > 0:
        return _empty_counting_dict(thr_array, stringency)

    # Mask the image with the selected threshold and remove the objects that
    # are too small or too big.
    img_mask = filtered_img > selected_thr
    labels = nd.label(img_mask)[0]
    for ob in measure.regionprops(labels):
        if ob.area < min_obj_area or ob.area > max_obj_area:
            img_mask[ob.coords[:, 0], ob.coords[:, 1]] = 0

    # Detect the peaks again, restricted to the remaining objects
    labels = nd.label(img_mask)[0]
    selected_peaks = feature.peak_local_max(
        filtered_img, min_distance=min_distance, threshold_abs=selected_thr,
        exclude_border=False, num_peaks=np.inf, footprint=None,
        labels=labels)
    if not selected_peaks.size:
        return _empty_counting_dict(thr_array, stringency)

    # Intensity of the image at the selected peaks
    selected_peaks_int = filtered_img[selected_peaks[:, 0],
                                      selected_peaks[:, 1]]

    return {
        'selected_thr': selected_thr,
        'calculated_thr': calculated_thr,
        'selected_peaks': selected_peaks,
        'thr_array': thr_array,
        'trimmed_thr_array': trimmed_thr_array,
        'peaks_coords': peaks_coords,
        'trimmed_peaks_coords': trimmed_peaks_coords,
        'total_peaks': total_peaks,
        'thr_idx': thr_idx,
        'stringency': stringency,
        'selected_peaks_int': selected_peaks_int,
    }