def segment_character2(img_gray):
    """Show the intermediate stages of an RLSA-based character segmentation.

    The input is copied, binarised with an inverted Otsu threshold, cleaned
    with ``remove_noise`` and a morphological opening (2x1 kernel), smeared
    with RLSA in both directions (value 10), and the external contours of the
    result are boxed on the copy.  Three windows are displayed:
    ``'erosion'``, ``'res'`` and ``'final'``.

    :param img_gray: grayscale image (ndarray); it is not modified
    :return: None
    """
    canvas = img_gray.copy()
    binary = cv2.threshold(canvas, 0, 255,
                           cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]
    binary = remove_noise(binary, 3)

    # Opening (erode then dilate) with a 2-row, 1-column kernel.
    opened = cv2.morphologyEx(binary, cv2.MORPH_OPEN, np.ones((2, 1)))
    cv2.imshow('erosion', opened)

    # RLSA is applied to the inverted image and the result is inverted back.
    smeared = rlsa.rlsa(cv2.subtract(255, opened), True, True, 10)
    blobs = cv2.subtract(255, smeared)
    cv2.imshow('res', blobs)

    found, _ = cv2.findContours(blobs, cv2.RETR_EXTERNAL,
                                cv2.CHAIN_APPROX_NONE)
    for outline in found:
        left, top, width, height = cv2.boundingRect(outline)
        if height <= 3:
            # Boxes of height 3 or less are not drawn.
            continue
        cv2.rectangle(canvas, (left, top), (left + width, top + height),
                      (0, 0, 0), 1)

    cv2.imshow('final', canvas)
    return
def get_rlsa_output(image):
    """
    Run RLSA with value 50 on a binary image and return the bitwise
    inverse of the smoothed result.
    """
    # Flags (1, 0) are passed positionally -- presumably horizontal on,
    # vertical off; confirm against the rlsa package in use.
    smoothed = rlsa.rlsa(image, 1, 0, 50)
    return cv2.bitwise_not(smoothed)
def extract_title(img):
    """Extract the title text of a page image with contour heuristics and OCR.

    Steps: binarise the page, keep contours whose bounding box is taller
    than twice the average contour height, merge them horizontally with
    RLSA, copy every merged region wider than 60% of the page onto a blank
    canvas and run pytesseract on that canvas.

    Intermediate images are written to ``binary.png``, ``contours.png``,
    ``filter.png``, ``rlsah.png`` and ``title.png`` in the current working
    directory.

    :param img: path of the image file, passed to ``cv2.imread``
    :return: the string returned by ``pytesseract.image_to_string``
    """
    image = cv2.imread(img)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # The binary image is inverted with ``~`` before contour detection, so
    # its foreground value has to be 255: with any other maxval (it used to
    # be 200) the inverse contains only non-zero pixels (55 and 255) and
    # findContours, which treats every non-zero pixel as foreground, sees
    # one page-sized blob.
    (thresh, binary) = cv2.threshold(gray, 100, 255,
                                     cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    cv2.imwrite('binary.png', binary)

    (contours, _) = cv2.findContours(~binary, cv2.RETR_EXTERNAL,
                                     cv2.CHAIN_APPROX_SIMPLE)
    boxes = [cv2.boundingRect(contour) for contour in contours]
    for (x, y, w, h) in boxes:
        cv2.rectangle(image, (x, y), (x + w, y + h), (0, 200, 0), 1)
    cv2.imwrite('contours.png', image)

    # White mask on which the tall (title candidate) contours are painted
    # black.  It is inverted later as well, hence 255 and not 200.
    mask = np.ones(image.shape[:2], dtype="uint8") * 255
    heights = [box[3] for box in boxes]
    # A page without any contour would otherwise divide by zero.
    avgheight = sum(heights) / len(heights) if heights else 0
    for contour, (x, y, w, h) in zip(contours, boxes):
        if h > 2 * avgheight:
            cv2.drawContours(mask, [contour], -1, 0, -1)
    cv2.imwrite('filter.png', mask)

    rows, cols = mask.shape
    value = max(math.ceil(rows / 100), math.ceil(cols / 100)) + 20  # heuristic
    mask = rlsa.rlsa(mask, True, False, value)  # horizontal smoothing
    cv2.imwrite('rlsah.png', mask)

    (contours, _) = cv2.findContours(~mask, cv2.RETR_EXTERNAL,
                                     cv2.CHAIN_APPROX_SIMPLE)
    mask2 = np.ones(image.shape, dtype="uint8") * 200  # blank 3 layer image
    for contour in contours:
        [x, y, w, h] = cv2.boundingRect(contour)
        if w > 0.60 * image.shape[1]:  # width heuristic
            title = image[y:y + h, x:x + w]
            mask2[y:y + h, x:x + w] = title  # copy the title onto the canvas
            image[y:y + h, x:x + w] = 200  # blank the title on the page
    cv2.imwrite('title.png', mask2)

    # No window is ever opened here (every imshow is commented out), so the
    # former cv2.waitKey(0) / cv2.destroyAllWindows() calls were dropped.
    title = pytesseract.image_to_string(Image.fromarray(mask2))
    return title
def get_roi(image):
    """
    Return the coordinates of the rectangles that contain the regions of
    interest in an image, together with the cropped regions.

    Argument
        image: ndarray - the image

    Returns
        coordinates: list<list> - list of lists in the format [x0, x1, y0, y1]
        roi: list<ndarray> - list of content-block images
    """
    ret, bin_image = cv2.threshold(image, 0, 255,
                                   cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    horizontal_smoothed_image = rlsa.rlsa(bin_image, True, False, 16)
    vertical_smoothed_image = rlsa.rlsa(bin_image, False, True, 8)
    smoothed_image = horizontal_smoothed_image & vertical_smoothed_image
    # A stray cv2.waitKey(0) used to sit here; this function opens no
    # window, so the call was removed.

    ret01, inv_smoothed_image = cv2.threshold(
        smoothed_image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    dilation_kernel = np.ones((3, 3), dtype=int)
    dilated_image = cv2.dilate(inv_smoothed_image, dilation_kernel,
                               iterations=2)

    ret2, labels = cv2.connectedComponents(dilated_image)
    roi = []
    roi_coordinates = []
    # Labels are iterated from 1, so label 0 is never reported.
    for label in range(1, ret2):
        rows, cols = np.where(labels == label)
        x0, x1 = np.amin(cols), np.amax(cols)
        y0, y1 = np.amin(rows), np.amax(rows)
        roi_coordinates.append([x0, x1, y0, y1])
        # The slice end is exclusive, so the crop stops one row/column
        # short of the reported x1 / y1 (unchanged from the original).
        roi.append(image[y0:y1, x0:x1])
    return roi_coordinates, roi
def _extract_boxes(self, layout):
    '''
    Desc: split the layout's source image into blocks and wrap each one
          according to the type the classifier reports
    Args:
        - layout: object providing get_src() (LayoutUtils)
    Returns:
        - list of Paragraph / Image blocks, in the order produced by the
          topological sort of the bounding-box graph; regions classified
          as anything other than 'Text' or 'Image' are left out
    '''
    source = layout.get_src()
    prepared = self._preprocess(source)

    # RLSA in both directions, then colour inversion before dilating.
    smeared = 255 - rlsa.rlsa(prepared, True, True, 10)
    dilated = cv2.dilate(smeared,
                         np.ones(config.BOX_DILATE_KERNEL, np.uint8),
                         iterations=config.BOX_DILATE_ITER)

    bboxes = self._calculate_bbox(dilated)
    order = self._build_graph(bboxes).topological_sort()

    boxes = []
    for id_ in order:
        x1, y1, x2, y2 = bboxes[id_].get_coords()
        region = source[y1:y2, x1:x2]
        # The classifier returns a pair; only the first item (type) is used.
        type_, _ = self._classifier.classify(region)
        if type_ == 'Text':
            block = Paragraph(id_, region)
        elif type_ == 'Image':
            block = Image(id_, region)
        else:
            continue
        boxes.append(block)
    return boxes
def run_RSLA(image_filename, scale_percent=25, rsla_thresh_h=10, rsla_thresh_v=10, contour_area=5):  #todo revisit these defaults
    '''
    Find bounding boxes of content regions with run-length smoothing.

    The image is downscaled, binarised, smoothed horizontally and vertically
    with RLSA, and the external contours of the inverted combination are
    boxed.  Box coordinates are scaled back by ``100 / scale_percent``.

    :param image_filename: path to image
    :param scale_percent: percent to scale image before rsla, should divide 100
    :param rsla_thresh_h: threshold for horizontal rsla
    :param rsla_thresh_v: threshold for vertical rsla
    :param contour_area: minimum acceptible contour region area
    :return: list of bounding boxes as (x0, y0, x1, y1) tuples in the
             coordinates of the original (unscaled) image
    '''
    bounding = []
    image = cv2.imread(image_filename)
    width = int(image.shape[1] * scale_percent / 100)
    height = int(image.shape[0] * scale_percent / 100)
    unscale = 100 / scale_percent
    dim = (width, height)

    # resize image
    resized = cv2.resize(image, dim, interpolation=cv2.INTER_AREA)
    gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
    (thresh, image_binary) = cv2.threshold(
        gray, 150, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

    image_rlsa_horizontal = rlsa.rlsa(image_binary, True, False, rsla_thresh_h)
    image_rlsa_vertical = rlsa.rlsa(image_binary, False, True, rsla_thresh_v)
    combo = np.bitwise_or(image_rlsa_horizontal, image_rlsa_vertical)
    combo = cv2.bitwise_not(combo)

    # findContours returns (image, contours, hierarchy) on OpenCV 3 but
    # (contours, hierarchy) on OpenCV 2 and 4; the contour list is always
    # the second-to-last item, so index it instead of unpacking three values.
    contours = cv2.findContours(combo, mode=cv2.RETR_EXTERNAL,
                                method=cv2.CHAIN_APPROX_SIMPLE)[-2]
    for contour in contours:
        if cv2.contourArea(contour) > contour_area:
            x, y, w, h = cv2.boundingRect(contour)
            bounding.append((x * unscale, y * unscale,
                             (x + w) * unscale, (y + h) * unscale))
    return bounding
def connect_horizontal(img_bin, rlsa_val=47):
    """Connect dots horizontal

    The image is inverted, smoothed with horizontal-only RLSA and inverted
    back.

    :param img_bin: binary image (ndarray)
    :param rlsa_val: value passed to the RLSA call
    :return: the smoothed image, in the same polarity as the input
    """
    inverted = cv2.subtract(255, img_bin)
    smoothed = rlsa.rlsa(inverted, True, False, rlsa_val)
    # Invert the RLSA *result*.  The previous code inverted the pre-RLSA
    # image again, which throws the smoothing away whenever rlsa.rlsa
    # returns a new array instead of modifying its argument in place.
    return cv2.subtract(255, smoothed)
def segment_words(img_gray, rlsa_val=7, bin_result=False):
    """
    Segment words with RLSA

    params:
        img_gray::ndarray:~ grayscale image
        rlsa_val::integer:~ value for run length smoothing algorithm
        bin_result::bool:~ crop from the denoised binary image instead of
                           the grayscale copy

    Returns a list of tuple -> ((x,y,w,h), image_array); boxes whose height
    is 3 or less are left out
    """
    gray = img_gray.copy()
    binary = cv2.threshold(gray, 0, 255,
                           cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]
    binary = remove_noise(binary, 30)
    # Keep a copy of the denoised binary image before the RLSA step.
    binary_snapshot = binary.copy()

    smeared = rlsa.rlsa(cv2.subtract(255, binary), True, True, rlsa_val)
    blobs = cv2.subtract(255, smeared)
    outlines, _ = cv2.findContours(blobs, cv2.RETR_EXTERNAL,
                                   cv2.CHAIN_APPROX_NONE)

    crop_source = binary_snapshot if bin_result else gray
    boxes = (cv2.boundingRect(outline) for outline in outlines)
    return [((x, y, w, h), crop_source[y:y + h, x:x + w])
            for (x, y, w, h) in boxes
            if h > 3]
def handleFileUpload():
    """Run title extraction, OCR and three predictions on an uploaded image.

    Reads the PNG files under ``images/``, extracts a title region with
    contour/RLSA heuristics, OCRs title and full page with pytesseract,
    deletes every file in ``images/``, then feeds the text through a pickled
    model (``models/wb_transform.pkl``), TextBlob sentiment and
    ``loaded_model`` and renders ``upload.html`` with the combined result.

    NOTE(review): ``stopwords`` and ``loaded_model`` are not defined in this
    function; they are presumably module-level names -- confirm.
    """
    pytesseract.pytesseract.tesseract_cmd = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
    filenames = [img for img in glob.glob("images/*.png")]
    images = []
    for img in filenames:
        n = cv2.imread(img)
        images.append(n)
    # NOTE(review): from here on only ``img`` as left by the loop (the last
    # matched file) is processed, and ``img`` is unbound when no PNG matched.
    image = cv2.imread(img)  # reading the image
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)  # convert2grayscale
    (thresh, binary) = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)  # convert2binary
    contours, hierarchy = cv2.findContours(~binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)  # find contours
    for contour in contours:
        [x, y, w, h] = cv2.boundingRect(contour)
        cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 1)
    mask = np.ones(
        image.shape[:2], dtype="uint8"
    ) * 255  # create blank image of same dimension of the original image
    contours, hierarchy = cv2.findContours(~binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    heights = [cv2.boundingRect(contour)[3] for contour in contours]  # collecting heights of each contour
    avgheight = sum(heights) / len(heights)  # average height
    # finding the larger contours
    # Applying Height heuristic: contours taller than twice the average are
    # painted black on the white mask
    for c in contours:
        [x, y, w, h] = cv2.boundingRect(c)
        if h > 2 * avgheight:
            cv2.drawContours(mask, [c], -1, 0, -1)
    x, y = mask.shape
    value = max(math.ceil(x / 100), math.ceil(y / 100)) + 20  #heuristic
    mask = rlsa.rlsa(mask, True, False, value)  #rlsa application
    contours, hierarchy = cv2.findContours(
        ~mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)  # find contours
    mask2 = np.ones(image.shape, dtype="uint8") * 255  # blank 3 layer image
    for contour in contours:
        [x, y, w, h] = cv2.boundingRect(contour)
        if w > 0.60 * image.shape[1]:  # width heuristic applied
            title = image[y:y + h, x:x + w]
            mask2[y:y + h, x:x + w] = title  # copied title contour onto the blank image
            image[y:y + h, x:x + w] = 255  # nullified the title contour on original image
    # OCR the title canvas, then OCR a fresh read of the whole page
    title = pytesseract.image_to_string(Image.fromarray(mask2))
    im = cv2.imread(img)
    content = pytesseract.image_to_string(im)
    path_to_dir = 'images/'  # path to directory you wish to remove
    files_in_dir = os.listdir(
        path_to_dir)  # get list of files in the directory
    for file in files_in_dir:  # loop to delete each file in folder
        os.remove(f'{path_to_dir}/{file}')
    # os.remove(img)
    content = content.replace("\n", " ")
    d = {'title': [title], 'text': [content], 'author': ["Beekash Mohanty"]}
    df_test = pd.DataFrame(data=d)
    #stopwords = {x: 1 for x in stopwords.words('english')}
    non_alphanums = re.compile(u'[^A-Za-z0-9]+')

    def normalize_text(text):
        # Lower-case, replace non-alphanumerics with spaces, and keep tokens
        # longer than one character that are not in ``stopwords``.
        return u" ".join(
            [x for x in [y for y in non_alphanums.sub(' ', text).lower().strip().split(" ")] \
             if len(x) > 1 and x not in stopwords])

    print("Loading models")
    pickle_model = "models/wb_transform.pkl"
    # NOTE(review): unpickling executes arbitrary code; the model file must
    # come from a trusted source.  The gzip handle is never closed.
    clf1 = pkl.load(gzip.open(pickle_model, 'rb'))
    stemmer = SnowballStemmer("english")

    def preprocess(df):
        # Fills missing values, adds a constant author category and stemmed
        # title/text columns, then drops the raw columns.  Works in place,
        # so the frame passed in is modified too.
        df['author'].fillna('No author', inplace=True)
        df['title'].fillna('No title', inplace=True)
        df['text'].fillna('No text', inplace=True)
        #search author encoded
        df_author = pd.read_csv('author_cat.csv')  # read but not used below
        #TODO check at notebook the values for the author and the equal query set the cateory id right
        df['author_cat'] = 1
        df['stemmed_title'] = df['title'].map(
            lambda x: ' '.join([stemmer.stem(y) for y in x.split(' ')]))
        df['stemmed_text'] = df['text'].map(
            lambda x: ' '.join([stemmer.stem(y) for y in x.split(' ')]))
        # drop the title autor and text
        df.drop(['title', 'author', 'text'], axis=1, inplace=True)
        return df

    df = preprocess(df_test)
    # NOTE(review): normalize_text is passed as the first positional
    # argument and ``non_negative`` is given as a keyword -- verify both
    # against the installed scikit-learn HashingVectorizer signature.
    vectorizer = HashingVectorizer(normalize_text,
                                   decode_error='ignore',
                                   n_features=2**23,
                                   non_negative=False,
                                   ngram_range=(1, 2),
                                   norm='l2')
    X_title = vectorizer.transform(df['stemmed_title'])
    #X_title = X_title[:, np.array(np.clip(X_title.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    X_text = vectorizer.transform(df['stemmed_text'])
    #X_text = X_text[:, np.array(np.clip(X_text.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    X_author = df['author_cat'].values
    X_author = X_author.reshape(-1, 1)
    sparse_merge = hstack((X_title, X_text, X_author)).tocsr()
    # Remove features with document frequency <= 100
    # NOTE(review): the clip bounds are (1, 0), i.e. min > max, whereas the
    # commented-out variants above use (0, 1) -- confirm which is intended.
    mask100 = np.array(np.clip(sparse_merge.getnnz(axis=0) - 100, 1, 0),
                       dtype=bool)
    X = sparse_merge[:, mask100]
    print(X.shape)
    print('Loading model to predict...')
    print('Loading model to predict...')
    y1 = clf1.predict(X)
    bloblist_desc = list()
    # df_test was modified in place by preprocess(), so it carries the
    # 'stemmed_text' column here.
    df_usa_descr_str = df_test['stemmed_text'].astype(str)
    for row in df_usa_descr_str:
        blob = TextBlob(row)
        bloblist_desc.append(
            (row, blob.sentiment.polarity, blob.sentiment.subjectivity))
    # The column named 'sentiment' holds TextBlob polarity and the column
    # named 'polarity' holds TextBlob subjectivity.
    df_usa_polarity_desc = pd.DataFrame(
        bloblist_desc, columns=['sentence', 'sentiment', 'polarity'])
    tweet_counts = loaded_model.method.transform(df_test['stemmed_text'])
    predictions = loaded_model.classifier.predict(tweet_counts)

    def f(df_usa_polarity_desc):
        # Map one row's 'sentiment' value to a label by its sign.
        if df_usa_polarity_desc['sentiment'] > 0:
            val = "Positive"
        elif df_usa_polarity_desc['sentiment'] == 0:
            val = "Neutral"
        else:
            val = "Negative"
        return val

    df_usa_polarity_desc["Sentiment_Type"] = df_usa_polarity_desc.apply(func=f,
                                                                        axis=1)
    cal = np.round(y1, 5) * 100
    # NOTE(review): a value of exactly 98 falls through to "Real" because
    # both comparisons around 98 are strict.
    if cal > 98:
        m = "This News is Fake"
    elif cal > 90 and cal < 98:
        m = "This News is more likely a Fake"
    else:
        m = "This News is Real"
    return render_template(
        "upload.html",
        prediction_text="Fake Rate={}".format(np.round(y1, 4) * 100) + "%" +
        "->" + m + " " + " Sentiments=" +
        df_usa_polarity_desc["Sentiment_Type"].values + " " + "Category=" +
        predictions)
def process_image(path_to_image, empty_output, output_dir):
    """Split a scanned page into articles and write text + XML per article.

    The page is resized to a width of 2048, binarised and cleaned, lines and
    closed polygons are masked out, and the remaining contours are divided
    into titles (taller than twice the average height) and contents.  Both
    groups are merged with RLSA, ordered with ``determine_precedence`` and
    walked title by title; the contents assigned to a title are pasted onto
    a blank canvas that is OCRed with pytesseract.  For every article a
    ``<image>-article-NN.txt`` and ``<image>-article-NN.xml`` file is written
    to ``output_dir``; with the root logger at DEBUG level the intermediate
    images are written there as well.

    :param path_to_image: path of the page image
    :param empty_output: when truthy, every file in ``output_dir`` is removed
        before processing
    :param output_dir: directory receiving all outputs
    :return: None; returns early when XML output for this image already
        exists, and calls ``sys.exit(0)`` when the image file is missing
    """
    output_path = os.path.dirname(path_to_image)
    last_folder_name = os.path.basename(output_path)
    image_name = os.path.basename(path_to_image)
    image_sans_ext = os.path.splitext(image_name)[0]

    # check that the file exists and exit if it does not
    # NOTE(review): the process exits with status 0 although this is logged
    # as critical.
    try:
        f = open(path_to_image)
        f.close()
    except FileNotFoundError:
        logging.critical('Given image does not exist')
        sys.exit(0)

    logging.info(f"Processing {image_name}")

    # skip images that already have XML output
    founds = glob.glob(f'{output_dir}/{image_sans_ext}-*.xml')
    if len(founds) > 0:
        logging.info(f"FILE EXISTS: {founds}")
        return

    # optionally clear the output directory (only reached when no earlier
    # output for this image was found above)
    if empty_output:
        files = glob.glob('{}/*'.format(output_dir))
        for f in files:
            os.remove(f)

    image = cv2.imread(path_to_image)  #reading the image
    # NOTE(review): height and width are captured before the resize below,
    # while all contour coordinates written to the XML are measured on the
    # resized image -- confirm this is intended.
    image_height = image.shape[0]
    image_width = image.shape[1]

    # standardize size of the images maintaining aspect ratio
    if image_width != 2048:
        image = imutils.resize(image, width=2048)

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)  # converting to grayscale image

    # applying thresholding technique on the grayscale image
    # all pixels value above 0 will be set to 255 but because we are using THRESH_OTSU
    # we have avoid have to set threshold (i.e. 0 = just a placeholder) since otsu's method does it automatically
    (thresh, im_bw) = cv2.threshold(
        gray, 0, 255,
        cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)  # converting to binary image
    # invert image data using unary tilde operator
    # im_bw = ~im_bw

    # Noise removal step - Perform opening on the thresholded image (erosion followed by dilation)
    kernel = np.ones((2, 2), np.uint8)  # kernel noise size (2,2)
    im_bw = cv2.morphologyEx(
        im_bw, cv2.MORPH_OPEN,
        kernel)  # cleans up random lines that appear on the page
    if logging.getLogger().level == logging.DEBUG:
        cv2.imwrite(
            os.path.join(output_dir, f'{image_sans_ext}-im-negative.png'),
            im_bw)
    if logging.getLogger().level == logging.DEBUG:
        cv2.imwrite(os.path.join(output_dir, f'{image_sans_ext}-im-bw.png'),
                    ~im_bw)

    # extract and draw any lines from the image
    lines_mask = draw_lines(image, gray)
    if logging.getLogger().level == logging.DEBUG:
        cv2.imwrite(
            os.path.join(output_dir, f'{image_sans_ext}-lines-mask.png'),
            lines_mask)  # debug remove

    # extract complete shapes likes boxes of ads and banners
    found_polygons_mask = extract_polygons(im_bw, lines_mask)
    if logging.getLogger().level == logging.DEBUG:
        cv2.imwrite(
            os.path.join(output_dir,
                         f'{image_sans_ext}-found-polygons-mask.png'),
            found_polygons_mask)  # debug remove

    # nullifying the mask of unwanted polygons over binary (toss images)
    # this should not only have texts, without images
    text_im_bw = cv2.bitwise_and(im_bw, im_bw, mask=found_polygons_mask)
    if logging.getLogger().level == logging.DEBUG:
        cv2.imwrite(
            os.path.join(output_dir,
                         f'{image_sans_ext}-text-im-bw-negative.png'),
            ~text_im_bw)

    # initialize blank images for extracted titles and contents
    titles_mask = np.ones(image.shape[:2], dtype="uint8") * 255
    contents_mask = np.ones(image.shape[:2], dtype="uint8") * 255

    (contours, _) = cv2.findContours(text_im_bw, cv2.RETR_EXTERNAL,
                                     cv2.CHAIN_APPROX_SIMPLE)
    heights = [cv2.boundingRect(contour)[3] for contour in contours]
    # NOTE(review): divides by zero when no contour was found.
    avgheight = sum(heights) / len(heights)

    title_widths = []
    content_widths = []
    if logging.getLogger().level == logging.DEBUG:
        debug_contents_mask = np.ones(
            image.shape,
            dtype="uint8") * 255  # blank 3 layer image for debug colour

    # finding the larger text
    for c in contours:
        [x, y, w, h] = cv2.boundingRect(c)
        cv2.rectangle(contents_mask, (x, y), (x + w, y + h), (255, 0, 0), 1)
        if h > 2 * avgheight:
            cv2.drawContours(titles_mask, [c], -1, 0, -1)
            title_widths.append(w)
        elif h * w > 20:
            # remove specks on dots
            # get the biggest chunks of texts... articles!
            cv2.drawContours(contents_mask, [c], -1, 0, -1)
            content_widths.append(w)
            if logging.getLogger().level == logging.DEBUG:
                cv2.drawContours(debug_contents_mask, [c], -1, 0, -1)
                cv2.rectangle(debug_contents_mask, (x, y), (x + w, y + h),
                              (0, 255, 0), 1)

    if logging.getLogger().level == logging.DEBUG:
        cv2.imwrite(
            os.path.join(output_dir,
                         f'{image_sans_ext}-debug_drawn_contours.png'),
            debug_contents_mask)

    # helps further detach titles if necessary. This step can be removed
    # titles_mask = cv2.erode(titles_mask, kernel, iterations = 1)
    m_height, m_width = titles_mask.shape  # get image dimensions, height and width

    # make 2D Image mask of proto-original image for cutting contents
    image_mask = np.ones(image.shape,
                         dtype="uint8") * 255  # blank 3 layer image
    image_mask[0:m_height, 0:m_width] = image[0:m_height, 0:m_width]

    # run length smoothing algorithms for vertical and lateral conjoining of pixels
    # NOTE(review): both averages below divide by zero when the respective
    # width list is empty (no title / no content contour).
    value = math.ceil(sum(title_widths) / len(title_widths)) * 2
    logging.info(f'RLSA Title Value {value}')
    rlsa_titles_mask = rlsa.rlsa(titles_mask, True, False,
                                 value)  #rlsa application
    rlsa_titles_mask_for_final = rlsa_titles_mask  # alias, not a copy
    if logging.getLogger().level == logging.DEBUG:
        cv2.imwrite(
            os.path.join(output_dir, f'{image_sans_ext}-rlsa-titles-mask.png'),
            rlsa_titles_mask)  # debug remove

    value = math.ceil(sum(content_widths) / len(content_widths)) * 3
    logging.info(f'RLSA Content Value {value}')
    rlsa_contents_mask = rlsa.rlsa(contents_mask, False, True,
                                   value)  #rlsa application
    rlsa_contents_mask_for_avg_width = rlsa_contents_mask  # alias, not a copy
    if logging.getLogger().level == logging.DEBUG:
        cv2.imwrite(
            os.path.join(output_dir,
                         f'{image_sans_ext}-rlsa-contents-mask.png'),
            rlsa_contents_mask)  # debug remove

    # get avg properties of columns
    contents_sum_list, contents_x_list, for_avgs_contours_mask = column_summaries(
        image, rlsa_contents_mask_for_avg_width)
    if logging.getLogger().level == logging.DEBUG:
        cv2.imwrite(
            os.path.join(output_dir,
                         f'{image_sans_ext}-for-avgs-contours-mask.png'),
            for_avgs_contours_mask)  # debug remove

    trimmed_mean = int(stats.trim_mean(contents_sum_list, 0.1))  # trimmed mean
    leftmost_x = min(contents_x_list)

    threshold = 2500  # remove tiny contours that dirtify the image

    ### titles work
    (contours, _) = cv2.findContours(~rlsa_titles_mask, cv2.RETR_EXTERNAL,
                                     cv2.CHAIN_APPROX_SIMPLE)
    # apply some heuristic to differentiate other stranger things masquerading as titles
    # (keeps contours whose bounding-box area exceeds ``threshold``)
    nt_contours = [
        contour for contour in contours
        if cv2.boundingRect(contour)[2] * cv2.boundingRect(contour)[3] > threshold
    ]
    total_columns = int(image.shape[1] / trimmed_mean)
    contours = sorted(
        nt_contours,
        key=lambda contour: determine_precedence(
            contour, total_columns, trimmed_mean, leftmost_x, m_height))
    clear_titles_mask = redraw_titles(image, contours)
    # draw_columns(leftmost_x, trimmed_mean, total_columns, clear_titles_mask)
    if logging.getLogger().level == logging.DEBUG:
        cv2.imwrite(
            os.path.join(output_dir,
                         f'{image_sans_ext}-clear-titles-mask.png'),
            clear_titles_mask)  # debug remove

    ### contents work
    (contours, _) = cv2.findContours(~rlsa_contents_mask, cv2.RETR_EXTERNAL,
                                     cv2.CHAIN_APPROX_SIMPLE)
    # apply some heuristic to different other stranger things masquerading as titles
    nt_contours = [
        contour for contour in contours
        if cv2.boundingRect(contour)[2] * cv2.boundingRect(contour)[3] > threshold
    ]
    contents_contours = sorted(
        nt_contours,
        key=lambda contour: determine_precedence(
            contour, total_columns, trimmed_mean, leftmost_x, m_height))
    clear_contents_mask = redraw_contents(image_mask, contents_contours)
    # draw_columns(leftmost_x, trimmed_mean, total_columns, clear_contents_mask)
    if logging.getLogger().level == logging.DEBUG:
        cv2.imwrite(
            os.path.join(output_dir,
                         f'{image_sans_ext}-sorted-clear-contents-mask.png'),
            clear_contents_mask)

    # start printing individual letters based on titles! The final act
    (contours, _) = cv2.findContours(~rlsa_titles_mask_for_final,
                                     cv2.RETR_EXTERNAL,
                                     cv2.CHAIN_APPROX_SIMPLE)
    # apply some heuristic to different other stranger things masquerading as titles
    nt_contours = [
        contour for contour in contours
        if cv2.boundingRect(contour)[2] * cv2.boundingRect(contour)[3] > threshold
    ]
    contours = sorted(
        nt_contours,
        key=lambda contour: determine_precedence(
            contour, total_columns, trimmed_mean, leftmost_x, m_height))

    article_complete = False
    title_came_up = True
    title_count = len(contours)
    ct_widths = []  # right edges (x + w) of the titles of the current article
    article_mask = np.ones(
        image.shape,
        dtype="uint8") * 255  # blank layer image for one article

    # XML skeleton for the first article
    letter_root = ET.Element("letter")
    desc = ET.SubElement(letter_root, "description")
    ET.SubElement(desc, "MeasurementUnit").text = "pixel"
    ocv_proc = ET.SubElement(desc,
                             "OPenCVProcessing",
                             pageImage=image_sans_ext)
    ET.SubElement(ocv_proc, "ProcessingDateTime").text = str(datetime.today())
    ET.SubElement(ocv_proc, "Script").text = 'Lettersiterate'
    layout = ET.SubElement(letter_root, "Layout")
    page = ET.SubElement(layout, "Page")
    print_space = ET.SubElement(page,
                                "PrintSpace",
                                height=str(image_height),
                                width=str(image_width),
                                xpos=str(0),
                                ypos=str(0))
    # ET.Element(print_space, attrib={'height':image_height, 'width':image_width, 'xpos':0, 'ypos':0})

    # Titles are visited as (current, next) pairs, so the loop body runs
    # len(contours) - 1 times and not at all when fewer than two titles
    # were found; the last title is handled as ``_next`` of the final pair.
    # for idx, contour in enumerate(contours):
    for idx, (_curr, _next) in enumerate(zip(contours[::], contours[1::])):
        # https://www.quora.com/How-do-I-iterate-through-a-list-in-python-while-comparing-the-values-at-adjacent-indices/answer/Jignasha-Patel-14
        if article_complete:
            article_mask = np.ones(
                image.shape, dtype="uint8"
            ) * 255  # blank layer image for another separate letter
            # xml file
            # NOTE(review): unlike the first skeleton, this one creates
            # "OPenCVProcessing" without the pageImage attribute.
            letter_root = ET.Element("letter")
            desc = ET.SubElement(letter_root, "description")
            ET.SubElement(desc, "MeasurementUnit").text = "pixel"
            ocv_proc = ET.SubElement(desc, "OPenCVProcessing")
            ET.SubElement(ocv_proc,
                          "ProcessingDateTime").text = str(datetime.today())
            ET.SubElement(ocv_proc, "Script").text = 'Lettersiterate'
            layout = ET.SubElement(letter_root, "Layout")
            page = ET.SubElement(layout, "Page")
            print_space = ET.SubElement(page,
                                        "PrintSpace",
                                        height=str(image_height),
                                        width=str(image_width),
                                        xpos=str(0),
                                        ypos=str(0))

        [cx, cy, cw, ch] = cv2.boundingRect(_curr)
        [nx, ny, nw, nh] = cv2.boundingRect(_next)
        ct_height = cy + ch  # title height in this column
        ct_widths.append(cx + cw)
        ct_width = max(
            ct_widths
        )  # adjust to get longest title width if multiple line title :)

        # dont proceed any further if the next title is right below it on same column
        # continue to next title
        # current and next have to be within the same column
        # detect last article in the columns
        if (idx + 2) == title_count:
            title_came_up = False
        elif cy < ny and ny - (nh * 3) < cy and nx < ct_width:
            # 1) current title is above next
            # 2) next title is directly above current
            # 3) next title is withing the length of the current title. Cannot be in another column
            # and considered directly below current. Phew!, it happened
            title_came_up = True
        else:
            title_came_up = False

        if not title_came_up:
            title_encounters = 0
            # loop through contents within these boundaries and insert them to the canvas
            for content_idx, content_contour in enumerate(contents_contours):
                [x, y, w, h] = cv2.boundingRect(content_contour)
                content_width = x + w
                # length -50 is to be safe sometimes the content cut maybe infringe onto the next title
                # get any content that starts within the title (take out -50) and within the end of the title width
                # and give (+50), it is still below the title
                logging.debug(
                    f"{x} >= {cx-50} and {x} <= {ct_width} and {y+50} > {ct_height}"
                )
                if x >= cx - 50 and x <= ct_width and y + 50 > ct_height:
                    # now that we have limited the content to be within the width and below the title of interest
                    # make sure it does not transgress into other titles. The bane of my existence begins, sigh!
                    for tidx, tcontour in enumerate(contours):
                        [tx, ty, tw, th] = cv2.boundingRect(tcontour)
                        # validating titles that are directly below
                        # 1) it has to be greater than the current title
                        # 2) it starts within the width of the current title
                        # 3) it starts within the width of the current content
                        # 4) it does not start left of the content even if we take out 50 pixels to the left (-50)
                        if tidx > idx and tx < ct_width and tx < content_width and tx > x - 50 and title_encounters < 1:
                            # print(f"TITLE BELOW---> ###{content_idx} ##{tidx} > #{idx} and {tx} < {content_width} and {cx} >= {x-50}")
                            article_mask = cutouts(article_mask,
                                                   clear_contents_mask,
                                                   content_contour)
                            ET.SubElement(print_space,
                                          "BodyText",
                                          height=str(h),
                                          width=str(w),
                                          xpos=str(x),
                                          ypos=str(y),
                                          contourId=str(idx),
                                          bodyTextContourId=str(content_idx))
                            # cv2.putText(article_mask, "###{content_idx},{x},{y}.{w},{h}", cv2.boundingRect(content_contour)[:2], cv2.FONT_HERSHEY_PLAIN, 1.50, [255, 0, 0], 2)
                            title_encounters += 1
                            # hitting a title in this case means we don't need to go any further for current content
                            break
                        # validating titles that are on a different column
                        # 1)it has to be greater than the current title
                        # 2)it starts within the width of the current title
                        # 3)it starts below this content but within the contents limits (meaning it is multicolumn extension)
                        if tidx > idx and tx < ct_width and (
                                ty > y
                                and tx > x - 50) and title_encounters < 1:
                            article_mask = cutouts(article_mask,
                                                   clear_contents_mask,
                                                   content_contour)
                            ET.SubElement(print_space,
                                          "BodyText",
                                          height=str(h),
                                          width=str(w),
                                          xpos=str(x),
                                          ypos=str(y),
                                          contourId=str(idx),
                                          bodyTextContourId=str(content_idx))
                    # validating titles that are at the end of the column
                    # 1) there is no title directly below it
                    if all(x < cv2.boundingRect(tcontour)[0]
                           for tidx, tcontour in enumerate(contours)
                           if tidx > idx and cv2.boundingRect(tcontour)[0] >
                           content_width) and title_encounters < 1:
                        article_mask = cutouts(article_mask,
                                               clear_contents_mask,
                                               content_contour)
                        ET.SubElement(print_space,
                                      "BodyText",
                                      height=str(h),
                                      width=str(w),
                                      xpos=str(x),
                                      ypos=str(y),
                                      contourId=str(idx),
                                      bodyTextContourId=str(content_idx))

        if title_came_up:
            # the next title continues this article: paste the current title
            # and keep accumulating on the same canvas
            ct_widths.append(cx + cw)
            article_title_p = clear_titles_mask[cy:cy + ch, cx:cx + cw]
            article_mask[
                cy:cy + ch, cx:cx +
                cw] = article_title_p  # copied title contour onto the blank image
            ET.SubElement(print_space,
                          "Title",
                          height=str(ch),
                          width=str(cw),
                          xpos=str(cx),
                          ypos=str(cy),
                          contourId=str(idx))
            article_complete = False
        else:
            # the article ends here: paste the title, OCR and write it out
            ct_widths = []  # reset widths
            article_title_p = clear_titles_mask[cy:cy + ch, cx:cx + cw]
            article_mask[
                cy:cy + ch, cx:cx +
                cw] = article_title_p  # copied title contour onto the blank image
            ET.SubElement(print_space,
                          "Title",
                          height=str(ch),
                          width=str(cw),
                          xpos=str(cx),
                          ypos=str(cy),
                          contourId=str(idx))
            if (idx + 2) == title_count:
                # we are at the end
                article_title_p = clear_titles_mask[ny:ny + nh, nx:nx + nw]
                article_mask[
                    ny:ny + nh, nx:nx +
                    nw] = article_title_p  # copied title contour onto the blank image

            file_name = f"article-{str(idx).zfill(2)}"
            if logging.getLogger().level == logging.DEBUG:
                cv2.imwrite(
                    os.path.join(output_dir,
                                 f"{image_sans_ext}-{file_name}.png"),
                    article_mask)
            article_complete = True

            content = pytesseract.image_to_string(
                Image.fromarray(article_mask))
            # the text file is opened in append mode, so reruns add to it
            with open(
                    os.path.join(output_dir,
                                 f'{image_sans_ext}-{file_name}.txt'),
                    'a') as the_file:
                the_file.write(content)

            ET.SubElement(page,
                          "TextBlock",
                          articleNo=str(file_name),
                          contourId=str(idx)).text = content
            tree = ET.ElementTree(letter_root)
            xml_output_file = os.path.join(
                output_dir, f'{image_sans_ext}-{file_name}.xml')
            # this method may cause 'OSError: [Errno 24] Too many open files' and does not prettyprint
            # tree.write(xml_output_file, encoding='utf8')
            # OR
            xmlstr = ET.tostring(letter_root).decode()
            xmlstr = minidom.parseString(xmlstr).toprettyxml(indent="\t",
                                                             newl="\n")
            with open(xml_output_file, 'w+') as outfile:
                outfile.write(xmlstr)
# file_path = filedialog.askdirectory() # print(file_path) # count_human_true = 0 # count_tree_true = 0 tree = train_classifier() ## for file in os.listdir(file_path): ## file_name = os.path.join(file_path, file) ## counter += 1 ## print("cleaning image " + str(counter) + " to path "+ file_path + '_output_' + file) # open_file = open("samples.txt", "r+") ## beginning file_name = '9_main-qimg-de9e1056b4cd97bbf39f1c8b4ba68f6a.jpg' img = cv2.cvtColor(cv2.imread(file_name), cv2.COLOR_BGR2GRAY) ret, thresh = cv2.threshold(img, 180, 255, cv2.THRESH_BINARY) img_rlsa_horizontal = rlsa.rlsa(thresh, True, False, 10) img_rlsa_vertical = rlsa.rlsa(img_rlsa_horizontal, False, True, 15) # cv2.imwrite("1_rlsa-smoothed.jpg", img_rlsa_vertical) opening = cv2.morphologyEx(img_rlsa_vertical, cv2.MORPH_OPEN, np.ones((3, 3), np.int), iterations=2) # cv2.imwrite('2_opened_image.jpg', opening) sure_bg = cv2.dilate(opening, None, iterations=5) sure_bg = sure_bg - cv2.erode(sure_bg, None) dist_transform = cv2.distanceTransform(opening, cv2.DIST_L2, 5) dist_transform = ((dist_transform - dist_transform.min()) / (dist_transform.max() - dist_transform.min()) * 255).astype(
heights = [cv2.boundingRect(contour)[3] for contour in contours] # collecting heights of each contour avgheight = sum(heights) / len(heights) # average height # finding the larger text for c in contours: [x, y, w, h] = cv2.boundingRect(c) if h > 2 * avgheight: cv2.drawContours(mask, [c], -1, 0, -1) cv2.imshow('mask', mask) x, y = mask.shape # image dimensions value = max(math.ceil(x / 100), math.ceil(y / 100)) + 20 mask = rlsa.rlsa(mask, True, False, value) #rlsa application cv2.imshow('mask1', mask) (_, contours, _) = cv2.findContours(~mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) mask2 = np.ones(image.shape, dtype="uint8") * 255 # blank 3 layer image for contour in contours: [x, y, w, h] = cv2.boundingRect(contour) if w > 0.60 * image.shape[1]: # width heuristic applied title = image[y:y + h, x:x + w] mask2[y:y + h, x:x + w] = title # copied title contour onto the blank image image[y:y + h, x:x + w] = 255 # nullified the title contour on original image
def process_image(path_to_image, empty_output, out_dir_name):
    """OCR a region of a page scan and append its first text line to a CSV.

    The image is resized to a width of 2048 px, binarised with Otsu's method,
    stripped of lines and polygons (via ``draw_lines`` / ``extract_polygons``),
    and its remaining contours are smeared horizontally with RLSA. Smeared
    blocks whose bounding box starts right of x=1000 and above y=100 are cut
    out, deskewed, dilated and passed to tesseract. The first line of the
    recognised text is appended, together with the image's base name, as a
    tab-separated row to ``<cwd>/dates/<out_dir_name>.csv``.

    Args:
        path_to_image: Path of the image file to process.
        empty_output: When truthy, every file in the ``dates`` output
            directory is deleted before processing.
        out_dir_name: Stem of the CSV file the result row is appended to.

    Returns:
        None. Returns early, without processing, when a file matching
        ``<image stem>-*.xml`` already exists in the output directory.

    Raises:
        SystemExit: With status 1 when the image does not exist or cannot be
            decoded.
    """
    image_name = os.path.basename(path_to_image)
    img_sans_ext = os.path.splitext(image_name)[0]

    # bail out early (with a failure status) when there is nothing to read
    if not os.path.isfile(path_to_image):
        log.critical('Given image does not exist')
        sys.exit(1)

    log.info(f"Processing {image_name}")

    # create out dir
    final_dir = os.path.join(os.getcwd(), r'dates')
    if not os.path.exists(final_dir):
        os.makedirs(final_dir)

    founds = glob.glob(f'{final_dir}/{img_sans_ext}-*.xml')
    if founds:
        log.info(f"FILE EXISTS: {founds}")
        return

    if empty_output:
        for stale in glob.glob('{}/*'.format(final_dir)):
            os.remove(stale)

    # Debug artefacts are only written when the root logger is at DEBUG level.
    debug = log.getLogger().level == log.DEBUG

    def debug_file(suffix):
        return os.path.join(final_dir, f'{img_sans_ext}{suffix}.png')

    image = cv2.imread(path_to_image)  # reading the image
    if image is None:
        # cv2.imread signals an undecodable file by returning None
        log.critical('Given image could not be read')
        sys.exit(1)

    # standardize size of the images maintaining aspect ratio
    if image.shape[1] != 2048:
        image = imutils.resize(image, width=2048)

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)  # converting to grayscale

    # With THRESH_OTSU the threshold argument (0) is only a placeholder:
    # Otsu's method picks the threshold automatically.
    _, im_bw = cv2.threshold(
        gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Noise removal step - perform opening (erosion followed by dilation)
    # on the thresholded image; cleans up random lines that appear on the page
    kernel = np.ones((2, 2), np.uint8)
    im_bw = cv2.morphologyEx(im_bw, cv2.MORPH_OPEN, kernel)

    if debug:
        cv2.imwrite(debug_file('-im-negative'), im_bw)
        cv2.imwrite(debug_file('-im-bw'), ~im_bw)

    # extract and draw any lines from the image
    lines_mask = draw_lines(image, gray)
    if debug:
        cv2.imwrite(debug_file('-lines-mask'), lines_mask)

    # extract complete shapes likes boxes of ads and banners
    found_polygons_mask = extract_polygons(im_bw, lines_mask)
    if debug:
        cv2.imwrite(debug_file('-found-polygons-mask'), found_polygons_mask)

    # nullifying the mask of unwanted polygons over binary (toss images)
    text_im_bw = cv2.bitwise_and(im_bw, im_bw, mask=found_polygons_mask)
    if debug:
        cv2.imwrite(debug_file('-text-im-bw-negative'), ~text_im_bw)

    # initialize blank image for extracted contents
    contents_mask = np.ones(image.shape[:2], dtype="uint8") * 255
    (contours, _) = cv2.findContours(text_im_bw, cv2.RETR_EXTERNAL,
                                     cv2.CHAIN_APPROX_SIMPLE)

    heights = [cv2.boundingRect(contour)[3] for contour in contours]
    # no contours at all: use 0 so the (then empty) loop below is a no-op
    # instead of failing with ZeroDivisionError
    avgheight = sum(heights) / len(heights) if heights else 0
    content_widths = []

    if debug:
        # blank 3 layer image for debug colour
        debug_mask = np.ones(image.shape, dtype="uint8") * 255

    for c in contours:
        x, y, w, h = cv2.boundingRect(c)
        cv2.rectangle(contents_mask, (x, y), (x + w, y + h), (255, 0, 0), 1)
        if h > 2 * avgheight:
            # avoid titles altogether
            continue
        if h * w > 20 and x > 1000 and y < 100:
            # avoid specks or dots; keep contours in the target region
            cv2.drawContours(contents_mask, [c], -1, 0, -1)
            content_widths.append(w)
            if debug:
                cv2.drawContours(debug_mask, [c], -1, 0, -1)
                cv2.rectangle(debug_mask, (x, y), (x + w, y + h),
                              (0, 255, 0), 1)

    if debug:
        cv2.imwrite(debug_file('-debug_drawn_contours'), debug_mask)

    # get image dimensions, height and width
    m_height, m_width = contents_mask.shape

    # copy of the (resized) original image used for cutting contents
    image_mask = np.ones(image.shape, dtype="uint8") * 255
    image_mask[0: m_height, 0: m_width] = image[0: m_height, 0: m_width]

    # RLSA smear length: five times the mean kept-contour width, or a fixed
    # fallback of 140 when nothing was kept
    if content_widths:
        value = math.ceil(sum(content_widths) / len(content_widths)) * 5
    else:
        value = 140
    log.info(f'RLSA Content Value {value}')

    # rlsa application
    rlsa_contents_mask = rlsa.rlsa(contents_mask, True, False, value)
    if debug:
        cv2.imwrite(debug_file('-rlsa-contents-mask'), rlsa_contents_mask)

    threshold = 1500  # remove tiny contours that dirtify the image
    (contours, _) = cv2.findContours(~rlsa_contents_mask, cv2.RETR_EXTERNAL,
                                     cv2.CHAIN_APPROX_SIMPLE)

    # blank layer image for one article
    article_mask = np.ones(image.shape, dtype="uint8") * 255

    # insert every sufficiently large block of the target region in the canvas
    for content_contour in contours:
        x, y, w, h = cv2.boundingRect(content_contour)
        if w * h <= threshold:
            continue
        if x > 1000 and y < 100:
            log.debug(f"{x} >= {x-50} and {x} {y+50}")
            article_mask = cutouts(article_mask, image_mask, content_contour)

    angle, rotated_article_mask = correct_skew(article_mask)
    log.info(f'Rotation Angle: {angle}')

    # Dilating the output improved overall readability by tesseract,
    # especially in cases where the resulting output was empty
    # https://stackoverflow.com/a/54582118/754432
    # NOTE(review): the kernel is the tuple (5, 5), not np.ones((5, 5));
    # confirm this is the intended structuring element.
    cv2.dilate(rotated_article_mask, (5, 5), rotated_article_mask)

    if debug:
        cv2.imwrite(debug_file(''), rotated_article_mask)

    # --psm 3: fully automatic page segmentation, but no OSD
    content = pytesseract.image_to_string(
        Image.fromarray(rotated_article_mask), config='--psm 3')

    # newline='' lets the csv module control line endings itself, as the csv
    # documentation requires for files handed to csv.writer
    csv_path = os.path.join(final_dir, f'{out_dir_name}.csv')
    with open(csv_path, 'a+', newline='') as f_out:
        writer = csv.writer(f_out, delimiter='\t')
        # only the first line of the OCR output is recorded
        writer.writerow([img_sans_ext, content.partition('\n')[0]])
def main(page, args=None):
    """Move detected symbols closer together in every column image of a page.

    For each column listed in the page's row-rectangle JSON, rows classified
    as ``'personnel'`` are scanned for symbol intervals (per-column ink count,
    median filter, threshold, horizontal RLSA, then ``SymbolDetection``).
    Walking the intervals from right to left and skipping the last one, every
    interval whose median lies beyond 35% of the row width is erased with a
    randomly sampled background patch and re-pasted shifted right by the gap
    to the next interval, both via ``cv2.seamlessClone``. The resulting column
    image is written to ``<args.outputdir>/<page>/<page>_<key>.png``.

    Args:
        page: Page identifier; used as the stem of the JSON inputs and of the
            image file names.
        args: Namespace-like object providing ``clsdir``, ``colrectdir``,
            ``rowrectdir``, ``imgdir`` and ``outputdir``. The default of
            ``None`` is not usable: the attributes are read unconditionally.

    Returns:
        None.
    """
    print("processing "+page)

    # read in data
    cls_file = os.path.join(args.clsdir, page+'.json')
    col_rect_file = os.path.join(args.colrectdir, page+'.json')
    row_rect_file = os.path.join(args.rowrectdir, page+'.json')
    bg_img = cv2.imread('/home/ubuntu/results/personnel-records/1956/seg/background.png')

    with open(col_rect_file) as file:
        col_rects = json.load(file)
    with open(row_rect_file) as file:
        row_rects = json.load(file)
    with open(cls_file) as file:
        cls = json.load(file)
    cls = cls['name']

    RLSA_thr = 30  # 50

    for key, row_rects_col in row_rects.items():
        col_img = cv2.imread(os.path.join(args.imgdir, page, page+'_'+key+'.png'))
        col_img_b = Binarization(col_img)
        _, M_col = Rect.CropRect(col_img_b, col_rects[int(key)])

        for i, row_rect in enumerate(row_rects_col):
            if cls[i] != 'personnel':
                continue

            # detect symbols
            row_img_b, _ = Rect.CropRect(col_img_b, Rect.RectOnDstImg(row_rect, M_col))
            count = np.sum(row_img_b/255, axis=0)
            count = signal.medfilt(count, 5)
            _, count = cv2.threshold(count, 3, 255, cv2.THRESH_BINARY_INV)
            count = rlsa.rlsa(count.T, True, False, RLSA_thr)
            symbol_intervals = SymbolDetection(255-count[-1], RLSA_thr)

            # decide if we need to move symbols closer
            if not symbol_intervals:
                continue

            # right-to-left over every interval except the last one
            for ii in range(len(symbol_intervals) - 2, -1, -1):
                if not np.median(symbol_intervals[ii]) > 0.35*row_img_b.shape[1]:
                    continue

                # copy the region of FName (src)
                row_img, M_col2row = Rect.CropRect(
                    col_img, Rect.RectOnDstImg(ExpandRect(row_rect), M_col))
                src_img = row_img[:, symbol_intervals[ii][0]:symbol_intervals[ii][1]].copy()
                # t is the distance between current and next symbol
                t = symbol_intervals[ii+1][0] - symbol_intervals[ii][1]
                M_row2col = np.linalg.inv(M_col2row)

                # manually setup mask, for better performance we should
                # automatically find a mask (binarization, DP, etc)
                roi_pts = np.array([[0, 0],
                                    [src_img.shape[1], 0],
                                    [src_img.shape[1], src_img.shape[0]],
                                    [0, src_img.shape[0]]], dtype="float32")
                # mask w.r.t M_row2col
                roi_pts = Rect.PtsOnDstImg(roi_pts, M_row2col)
                roi_pts = roi_pts - np.min(roi_pts, axis=0)
                height, width = np.max(roi_pts, axis=0)[::-1]
                mask = np.zeros([min(height, src_img.shape[0]), min(width, src_img.shape[1])])
                roi_mask = cv2.fillConvexPoly(mask, roi_pts, 255)

                # fill the region of FName with random sampled background
                center = [[np.median(symbol_intervals[ii]), row_img.shape[0] / 2]]
                center = tuple(Rect.PtsOnDstImg(center, M_row2col, False)[-1])
                x, y = np.random.randint(bg_img.shape[0]-roi_mask.shape[0], size=1)[0], np.random.randint(bg_img.shape[1]-roi_mask.shape[1], size=1)[0]
                try:
                    col_img = cv2.seamlessClone(
                        bg_img[x:x+roi_mask.shape[0], y:y+roi_mask.shape[1]],
                        col_img, roi_mask.astype(np.uint8), center, cv2.NORMAL_CLONE)
                    # paste the src region to target region
                    center = [[np.median(symbol_intervals[ii]) + t, row_img.shape[0] / 2]]
                    center = tuple(Rect.PtsOnDstImg(center, M_row2col, False)[-1])
                    col_img = cv2.seamlessClone(
                        src_img, col_img, roi_mask.astype(np.uint8), center, cv2.NORMAL_CLONE)
                    symbol_intervals[ii] = [symbol_intervals[ii][0]+t, symbol_intervals[ii][1]+t]
                except Exception:
                    # Deliberate best-effort: cloning fails if part of the src
                    # img is out of the dst image. Computing on the original
                    # image can avoid this problem, but this is much faster
                    # and there is no big difference. `Exception` (rather than
                    # a bare except) keeps Ctrl-C and SystemExit working.
                    print("ignore first/last row for "+page+'_'+key)

        # drop the classifications consumed by this column
        cls = cls[len(row_rects_col):]

        out_dir = os.path.join(args.outputdir, page)
        if not os.path.isdir(out_dir):
            # makedirs also creates a missing args.outputdir parent
            os.makedirs(out_dir)
            print('creating directory ' + out_dir)
        cv2.imwrite(os.path.join(out_dir, page+'_'+key+'.png'), col_img)
for c in contours: [x, y, w, h] = cv2.boundingRect(c) if h > 2 * avgheight: cv2.drawContours(mask_titles, [c], -1, 0, -1) else: cv2.drawContours(mask_contents, [c], -1, 0, -1) #cv2.imshow('mask_titles', mask_titles) cv2.imwrite('mask_titles.png', mask_titles) #cv2.imshow('mask_contents', mask_contents) cv2.imwrite('mask_contents.png', mask_contents) x, y = mask_titles.shape # image dimensions value = max(math.ceil(x / 100), math.ceil(y / 100)) + 20 rlsa_titles_mask = rlsa.rlsa(mask_titles, True, False, value) #rlsa application rlsa_titles_mask_for_final = rlsa.rlsa(mask_titles, True, False, value) #rlsa application cv2.imwrite('rlsa_title_mask.png', rlsa_titles_mask) value = max(math.ceil(x / 100), math.ceil(y / 100)) + 20 rlsa_contents_mask = rlsa.rlsa(mask_contents, False, True, value) #rlsa application rlsa_contents_mask_for_avg_width = rlsa.rlsa(mask_contents, False, True, value) #rlsa application cv2.imwrite('rlsa_contents_mask.png', rlsa_contents_mask) cv2.imwrite('rlsa_contents_mask_for_avg_width.png', rlsa_contents_mask_for_avg_width) # CALC AVG WIDTHS?! (for_avgs_contours, _) = cv2.findContours(~rlsa_contents_mask_for_avg_width,
# finding the larger text for idx, contour in enumerate(contours): [x, y, w, h] = cv2.boundingRect(contour) # cv2.rectangle(image, (x,y), (x+w,y+h), (0, 255, 0), 1) if h > 2 * avgheight: cv2.drawContours(mask, [contour], -1, 0, -1) # heading like contours else: cv2.drawContours(mask_content, [contour], -1, 0, -1) # everything else not heading-like cv2.imshow('contour', image) # on original image cv2.imwrite('contours.png', image) # attempt to get large content blocks image_rlsa = rlsa.rlsa(mask_content, True, True, 10) # both hori and verti (contours, _) = cv2.findContours(~image_rlsa, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) heights = [cv2.boundingRect(contour)[3] for contour in contours] # collecting heights of each contour avgheight = sum(heights) / len(heights) # average height print(avgheight, 3 * avgheight) widths = [ cv2.boundingRect(contour)[2] for contour in contours if cv2.boundingRect(contour)[3] > 2 * avgheight ] # collecting widths of contours with above average height avgwidth = sum(widths) / len(widths) # average width widths.sort() widths = list(dict.fromkeys(widths)) # remove duplicates trimmed_widths = stats.trim_mean(top_chunk(widths, 3),
def title(image_received):
    """Extract the text of wide, tall-lettered regions from an image via OCR.

    Near-black pixels are contoured, contours more than twice the average
    height are kept, smeared horizontally with RLSA, and every smeared region
    wider than 60% of the image is copied onto a white canvas that is then
    passed to tesseract.

    Args:
        image_received: 3-channel image array (converted with
            ``cv2.COLOR_BGR2HSV``). It is modified in place: every region
            treated as a title is overwritten with 255.

    Returns:
        The string returned by ``pytesseract.image_to_string`` for the canvas
        holding the title regions.
    """
    # No copy is made: the title regions are blanked on the caller's array.
    image = image_received

    # Step 1: select dark pixels, i.e. all three HSV channels within [0, 20].
    imghsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    dark_pixels = cv2.inRange(imghsv, (0, 0, 0), (20, 20, 20))

    # Step 2: contouring, i.e. finding all letters.
    (contours, _) = cv2.findContours(
        dark_pixels, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)

    # collecting all the heights of each contour and averaging them; an image
    # without any dark contour yields 0 instead of a ZeroDivisionError
    heights = [cv2.boundingRect(contour)[3] for contour in contours]
    average_height = sum(heights) / len(heights) if heights else 0

    # blank image with the same height and width as the given image
    mask = np.ones(image.shape[:2], dtype="uint8") * 255
    for contour in contours:
        [x, y, w, h] = cv2.boundingRect(contour)
        if h > 2 * average_height:
            cv2.drawContours(mask, [contour], -1, 0, -1)

    # Step 3: applying RLSA horizontal on the image
    rows, cols = mask.shape
    value = max(math.ceil(rows / 100), math.ceil(cols / 100)) + 50
    mask = rlsa.rlsa(mask, True, False, value)

    # Step 4: applying above image in main image
    (contours, _) = cv2.findContours(~mask, cv2.RETR_EXTERNAL,
                                     cv2.CHAIN_APPROX_SIMPLE)

    # blank image
    mask2 = np.ones(image.shape, dtype="uint8") * 255
    for contour in contours:
        [x, y, w, h] = cv2.boundingRect(contour)
        if w > 0.60 * image.shape[1]:
            title_region = image[y:y + h, x:x + w]
            # copied title contour onto the blank image
            mask2[y:y + h, x:x + w] = title_region
            # nullified the contour on original image
            image[y:y + h, x:x + w] = 255

    extracted_title = pytesseract.image_to_string(mask2)
    return extracted_title