from PIL import Image, ImageDraw, ImageFont
from alphabet_detector import AlphabetDetector
from bidi.algorithm import get_display
import arabic_reshaper
import textwrap3


def Pic_Book(row):
    # `sheet` is an openpyxl worksheet defined at module level:
    # column 17 holds the output path, column 4 the book title.
    im1 = Image.open(r'C:\Users\user\Desktop\darlusail\book.jpg')
    pic_cell = sheet.cell(row, 17).value
    title_cell = str(sheet.cell(row, 4).value)

    draw = ImageDraw.Draw(im1)
    font = ImageFont.truetype('arial.ttf', size=10)
    color = 'rgb(0, 0, 0)'  # black

    # Reshape the title so Arabic letters join correctly, then apply
    # the bidi algorithm to get the proper display order.
    txt = arabic_reshaper.reshape(title_cell)
    message = get_display(txt)

    # Wrap the title to 20 characters per line.
    wrapper = textwrap3.TextWrapper(width=20)
    word_list = wrapper.wrap(text=message)

    ad = AlphabetDetector()
    if ad.is_arabic(txt) or not ad.is_latin(txt):
        # Arabic (or otherwise non-Latin) titles: reverse the wrapped
        # lines so the right-to-left text reads top to bottom.
        print("the name contains Arabic words")
        caption_new = '\n'.join(reversed(word_list)) + '\n'
    else:
        print("the name doesn't contain Arabic words")
        caption_new = '\n'.join(word_list) + '\n'
    print(caption_new)

    # Center the caption horizontally and place it at mid-height.
    w, h = draw.textsize(caption_new, font=font)
    W, H = im1.size
    x, y = 0.5 * (W - w), 0.5 * H
    draw.text((x, y), caption_new, fill=color, font=font)

    # Resize to the target dimensions and save; Image.save returns
    # None, so there is nothing useful to assign.
    size = 563, 285
    im1 = im1.resize(size, Image.ANTIALIAS)
    im1.save(r"C:" + pic_cell)
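# A minimal, hypothetical driver for Pic_Book, assuming `sheet` comes from
# an openpyxl workbook; the workbook path and the header-row layout are
# assumptions, not part of the original snippet.
from openpyxl import load_workbook

wb = load_workbook(r'C:\Users\user\Desktop\darlusail\books.xlsx')
sheet = wb.active
for row in range(2, sheet.max_row + 1):  # skip the header row
    Pic_Book(row)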
from string import punctuation

from alphabet_detector import AlphabetDetector
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


def process_word(word):
    """Normalize a token; return '' if it should be discarded."""
    ad = AlphabetDetector()
    if not ad.is_latin(word):
        return ''

    # Discard URL and file-name fragments.
    banned_fragments = ['.html', 'http://', 'https://', '.jpg', '.svg', '.png']
    for banned in banned_fragments:
        if banned in word:
            return ''

    word = word.lower()
    word = PorterStemmer().stem(word)

    # Strip punctuation, digits and the registered-trademark sign.
    chars = list(punctuation) + list('0123456789') + ['®']
    for char in chars:
        word = word.replace(char, '')

    # Drop stopwords, plus a few junk tokens seen in the data.
    stop_words = list(stopwords.words('english'))
    stop_words += ['aa', 'aaa', 'bb', 'bbb']
    if word in stop_words:
        return ''

    # Drop words made of a single repeated character (e.g. 'xxxx').
    if len(set(word)) == 1:
        return ''
    return word
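# Sketch usage; assumes the NLTK stopword corpus is available
# (run nltk.download('stopwords') once beforehand).
print(process_word('Running'))    # -> 'run' (lowercased and stemmed)
print(process_word('the'))        # -> ''   (stopword)
print(process_word('photo.jpg'))  # -> ''   (banned fragment)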
from alphabet_detector import AlphabetDetector
from telegram import ParseMode


def parse_text(bot, update):
    # run_apertium_tagger is the project's wrapper around the Apertium tagger.
    text = update.message.text
    ad = AlphabetDetector()
    if ad.is_latin(text):
        update.message.reply_text(
            "Right now only cyrillic characters are supported :( \n"
            "Enter a Kyrgyz text:")
    else:
        match_list = run_apertium_tagger(text, mode="text")
        output_list, error_list = [], []
        for word_match in match_list:
            if not word_match:
                continue
            # Restore Cyrillic "й", which the tagger transliterates as "J"/"j".
            word = str(word_match[1]).replace("J", "Й").replace("j", "й")
            if word_match[2] is not None:
                output_list.append(word + "*" + word_match[2] + "*")
            else:
                # No analysis: show the word in italics and remember it.
                output_list.append("_" + word + "_")
                error_list.append(word)
        update.message.reply_text(" ".join(output_list),
                                  parse_mode=ParseMode.MARKDOWN)
        if error_list:
            update.message.reply_text(
                "⚠️ These words were not recognized by the parser ⚠️")
            for word in error_list:
                update.message.reply_text(word)
        update.message.reply_text("*-----*", parse_mode=ParseMode.MARKDOWN)
def parse_input(bot, update, user_data):
    text = update.message.text
    ad = AlphabetDetector()
    if ad.is_latin(text):
        update.message.reply_text(
            "Right now only cyrillic characters are supported :( \n"
            "Enter a Kyrgyz word:")
    else:
        result = run_apertium_tagger(text)
        # Reset any stem left over from the previous word.
        user_data["stem"] = []
        # Restore Cyrillic "й", which the tagger transliterates as "J"/"j".
        stem = str(result[2]).replace("J", "Й").replace("j", "й")
        user_data['stem'].append(stem.lower())
        if result[3] is not None:
            user_data['stem'].append(1)
            user_data['stem'].append(result[3])
            reply = stem + "*" + result[3] + "*"
            update.message.reply_text(reply, parse_mode=ParseMode.MARKDOWN)
            update.message.reply_text(
                "Look up stem in the dictionary? \n"
                "Press /find or enter next word to continue.")
        else:
            user_data['stem'].append(0)
            update.message.reply_text(stem)
            update.message.reply_text(
                "Parsing didn't work :( \n"
                "Look up stem in the dictionary? \n"
                "Press /find or enter next word to continue.")
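# A hypothetical wiring sketch for the handlers above, written for
# python-telegram-bot 11.x (matching their (bot, update) signatures);
# the token is a placeholder. parse_text would be registered the same
# way, minus pass_user_data, in its own conversation state.
from telegram.ext import Updater, MessageHandler, Filters

updater = Updater('YOUR_BOT_TOKEN')
updater.dispatcher.add_handler(
    MessageHandler(Filters.text, parse_input, pass_user_data=True))
updater.start_polling()
updater.idle()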
from alphabet_detector import AlphabetDetector


def check_case(file):
    """Report the first Latin word in `file` that is not all lowercase."""
    lines = file.readlines()
    ad = AlphabetDetector()
    for line in lines:
        word = line.strip()  # drop the trailing newline before the checks
        if word.isalpha() and ad.is_latin(word) and not word.islower():
            print(word)
            return "Please repair the case to lowercase in " + file.name + "."
    return "PASS"
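# Hypothetical usage on a one-word-per-line file; the file name is made up.
with open('wordlist.txt', 'w', encoding='utf-8') as f:
    f.write('apple\nBanana\ncherry\n')
with open('wordlist.txt', encoding='utf-8') as f:
    print(check_case(f))  # prints 'Banana', then the repair message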
from alphabet_detector import AlphabetDetector


def drop_cols_not_in_mapping(df, mapper):
    """Drop DataFrame columns whose header does not appear in `mapper`.

    Hebrew headers are matched (after clean_text) against the mapper's
    keys, Latin headers against its values. clean_text and
    drop_col_if_exists are the project's own helpers.
    """
    ad = AlphabetDetector()
    for header in list(df.columns):
        if ad.is_hebrew(header) and clean_text(header) not in mapper:
            df = drop_col_if_exists(df, header)
        if ad.is_latin(header) and header not in mapper.values():
            df = drop_col_if_exists(df, header)
    return df
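# A self-contained sketch of drop_cols_not_in_mapping in action; the two
# stubs below are hypothetical stand-ins for the project's own clean_text
# and drop_col_if_exists helpers.
import pandas as pd


def clean_text(s):
    return s.strip()


def drop_col_if_exists(df, col):
    return df.drop(columns=[col], errors='ignore')


mapper = {'שם': 'name'}  # Hebrew header mapped to its Latin equivalent
df = pd.DataFrame(columns=['שם', 'גיל', 'name', 'age'])
print(list(drop_cols_not_in_mapping(df, mapper).columns))  # ['שם', 'name']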
def _test_caps_to_camel_case(a, b):
    ad = AlphabetDetector()
    if not isinstance(a, str):
        # Non-strings are passed through unchanged.
        assert a == b
    elif not ad.is_latin(a):
        # Non-Latin strings are left alone.
        assert a == b
    elif len(a) < 4 or a.upper() != a:
        # Short or not-all-caps strings are left alone too.
        assert a == b
    else:
        # All-caps words of four or more letters get camel-cased:
        # same length, different spelling.
        assert len(a) == len(b)
        assert a != b
def isLatin(string):
    try:
        ad = AlphabetDetector()
        return ad.is_latin(string)
    except Exception:
        # Non-string input (or anything else the detector chokes on)
        # counts as "not Latin".
        return False
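# Sketch usage; the inputs are illustrative.
print(isLatin('hello'))   # True
print(isLatin('привет'))  # False (Cyrillic)
print(isLatin(None))      # False (the exception is swallowed)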
import os
import re

import nltk
from alphabet_detector import AlphabetDetector

ad = AlphabetDetector()

# txt2paragraph and filter_non_printable are the project's own helpers;
# `file` is the path of the text file being converted.
pre, ext = os.path.splitext(file)
with open(pre + ".pdfdata", 'w') as target_file:
    sentences = []
    for paragraph in txt2paragraph(file):
        # Normalize quotes and bullets, and turn '?'/'!' into full stops.
        for old, new in (('"', ' '), ('”', ' '), ('“', ' '), ('•', ' '),
                         ('?', '.'), ('!', '.'), (';', ' '), (':', ' '),
                         ("’", "'")):
            paragraph = paragraph.replace(old, new)
        paragraph = ' '.join(paragraph.split())  # collapse whitespace
        temp = nltk.sent_tokenize(paragraph)
        sentences.extend(temp)
        # Flush the buffer once it ends on a sentence with a full stop.
        if len(temp) > 1 and temp[-1].endswith('.'):
            paragraph = ""
            for sentence in sentences:
                sentence = filter_non_printable(sentence)
                # Keep Latin sentences that are not runs of stray single
                # characters and contain no page-number fragments.
                if (ad.is_latin(sentence)
                        and not re.search(r"([\s.!#?^@-\\\|\*,]+\w[\s.!#?^@-\\\|\*,]+)+\w[\s.!#?^@-\\\|\*,]+", sentence)
                        and not re.search(r"\/\d+|\d+\/", sentence)):
                    if paragraph != "":
                        paragraph += " "
                    paragraph += sentence
            if paragraph != "":
                target_file.write(paragraph + "\n")
            sentences.clear()
os.remove(file)