def do_split_marker_by_script(sfm, find_marker, script1, script2, new_marker1, new_marker2):
    """Find a given marker and split it by script."""
    ad = AlphabetDetector()
    new_sfm = sfm
    count = 0
    logging.info("\nIn do_split_marker_by_script code:\n")
    for i, entry in enumerate(new_sfm):
        for j, field in enumerate(entry):
            marker, data = field
            if marker == find_marker:
                count = count + 1
                scripts = ad.detect_alphabet(data)
                script_count = len(scripts)
                if script_count == 1:
                    script = next(iter(scripts))
                    new_field = [marker + '_' + script, data]
                    new_sfm[i].insert(j + 1, new_field)
                    #logging.info("\nFound '{}' only containing {}. Adding new field: {}".format(data, script, new_field))
                    #logging.info(new_sfm[i])
                elif script_count > 1:
                    print("\nFound {} scripts: {}".format(len(scripts), scripts))
                    print("Data is {}".format(data))
                    for script_number, script in enumerate(scripts):
                        string_list = [character for character in data
                                       if character == space or script in ad.detect_alphabet(character)]
                        string = ''.join(string_list).strip()
                        new_field = [marker + '_' + script, string]
                        print("New_field is {}".format(new_field))
                        new_sfm[i].insert(j + script_number + 1, new_field)
                        print(new_sfm[i])
    return new_sfm, count
def __freqs_dict(self, raw_text):
    t_start = time()
    print('Making filtered text...')
    stopset = set(stopwords.words('russian'))
    ad = AlphabetDetector()
    tokens = word_tokenize(raw_text)
    tokens_filtered = [w.lower() for w in tokens
                       if w not in stopset
                       and w not in self.__custom_stopwords
                       and w.isalpha()
                       and len(w) >= self.__min_word_len
                       and ad.is_cyrillic(w)]
    freqs_tokenized_text = FreqDist(tokens_filtered)
    freqs_most_common = OrderedDict(freqs_tokenized_text.most_common(self.__max_words))
    res_text = ''
    for item in freqs_most_common.items():
        word = item[0]
        freq = item[1]
        for i in range(freq):
            res_text += word + ' '
    t_end = time()
    print("TIME = %.2f s" % (t_end - t_start))
    return res_text
def load_json_newsletters(corpus_dir):
    alphabet_detector = AlphabetDetector()
    arb_corpus = list()
    eng_corpus = list()
    ids = list()
    json_files = glob.glob(corpus_dir + '/*.json')
    print('# of newsletters:', len(json_files))
    for json_file in json_files:
        json_doc = json.loads(open(json_file).read())
        try:
            j_articles = json_doc['articles']
            # print('# of articles:', len(j_articles))
            for e in j_articles:
                doc_id = j_articles[e]['id']
                title = clean_text(j_articles[e]['title'])
                text = clean_text(j_articles[e]['body'])
                # print(text)
                link = j_articles[e]['link']
                if text and 'ARABIC' in alphabet_detector.detect_alphabet(text):
                    arb_corpus.append(text)
                else:
                    eng_corpus.append(text)
        except KeyError:
            continue
    print('# of Arabic documents:', len(arb_corpus))
    print('# of English documents:', len(eng_corpus))
    return arb_corpus, eng_corpus
def extractor():
    alpha_det = AlphabetDetector()
    url = request.args.get('url')
    if not url:
        return render_template('no_url.html')
    url = url.strip()
    # title = extract_util.get_title(url)
    title, newspaper3k_text = util.extract_newspaper3k(url)
    if 'ARABIC' in alpha_det.detect_alphabet(title):
        text_dir = 'rtl'
        lang = 'Arabic'
    else:
        text_dir = 'ltr'
        lang = 'English'
    date = util.get_date(url)
    text_justext = util.get_text_justext(url, lang)
    news_please_text = util.extract_news_please(url)
    # _, bs4_text = extract_util.get_title_text_BS4(url)
    text_pextract = util.get_pextract(url)
    texts = OrderedDict()
    texts['Justext'] = text_justext
    texts['Newspaper3k'] = newspaper3k_text
    texts['NewsPlease'] = news_please_text
    texts['pextract'] = text_pextract
    return render_template('article_info.html', url=url, title=title, date=date,
                           text_dir=text_dir, texts=texts)
def Pic_Book(x):
    im1 = Image.open(r'C:\\Users\\user\\Desktop\\darlusail\\book.jpg')
    Pic_cell = sheet.cell(x, 17).value
    Title_cell = str(sheet.cell(x, 4).value)
    draw = ImageDraw.Draw(im1)
    font = ImageFont.truetype('arial.ttf', size=10)  # desired size
    txt = arabic_reshaper.reshape(Title_cell)  # reshape Arabic text for correct glyph joining
    message = get_display(txt)
    color = 'rgb(0, 0, 0)'  # black color
    wrapper = textwrap3.TextWrapper(width=20)
    # draw the message on the background with text wrapper
    word_list = wrapper.wrap(text=message)
    caption_new = ''
    ad = AlphabetDetector()
    if ad.is_arabic(txt) or not ad.is_latin(txt):  # check whether the text contains Arabic words
        print(" the name contains arabic words")
        for ii in reversed(word_list):
            caption_new = caption_new + ii + '\n'
    if ad.is_latin(txt):
        print(" the name doesn't contain arabic words")
        for ii in word_list:
            caption_new = caption_new + ii + '\n'
    print(caption_new)
    font = ImageFont.truetype('arial.ttf', size=10)  # define the font for the text
    color = 'rgb(0, 0, 0)'
    # w, h = draw.textsize(message, font=font)
    w, h = draw.textsize(caption_new, font=font)
    W, H = im1.size
    # x, y = 0.5 * (W - w), 0.90 * H - h
    x, y = 0.5 * (W - w), 0.5 * H
    draw.text((x, y), caption_new, fill=color, font=font)
    size = 563, 285  # define the dimensions to resize in the next step
    im1 = im1.resize(size, Image.ANTIALIAS)
    im2 = im1.save(r"C:" + Pic_cell)  # save the image with the new configurations
def process_word(word):
    ad = AlphabetDetector()
    if not ad.is_latin(word):
        return ''
    banned_fragments = ['.html', 'http://', 'https://', '.jpg', '.svg', '.png']
    for banned in banned_fragments:
        if banned in word:
            return ''
    word = word.lower()
    word = PorterStemmer().stem(word)
    chars = list(punctuation)
    chars = chars + ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
    chars = chars + ['®']
    for char in chars:
        word = word.replace(char, '')
    stop_words = list(stopwords.words('english'))
    stop_words += ['aa', 'aaa', 'bb', 'bbb']
    for stop in stop_words:
        if stop == word:
            return ''
    if len(set(word)) == 1:
        return ''
    return word
def parse_input(bot, update, user_data):
    input = update.message.text
    ad = AlphabetDetector()
    if ad.is_latin(input):
        update.message.reply_text(
            "Right now only cyrillic characters are supported :( \n"
            "Enter a Kyrgyz word:")
    else:
        result = run_apertium_tagger(input)
        if user_data.setdefault("stem", []):
            user_data["stem"] = []
        result_2 = str(result[2]).replace("J", "Й")
        result_2 = result_2.replace("j", "й")
        user_data['stem'].append(result_2.lower())
        if result[3] is not None:
            user_data['stem'].append(1)
            user_data['stem'].append(result[3])
            reply = result_2 + "*" + result[3] + "*"
            update.message.reply_text(reply, parse_mode=ParseMode.MARKDOWN)
            update.message.reply_text(
                "Look up stem in the dictionary? \n"
                "Press /find or enter next word to continue.")
        else:
            user_data['stem'].append(0)
            update.message.reply_text(result_2)
            update.message.reply_text(
                "Parsing didn't work :( \n"
                "Look up stem in the dictionary? \n"
                "Press /find or enter next word to continue.")
def parse_text(bot, update):
    input = update.message.text
    ad = AlphabetDetector()
    if ad.is_latin(input):
        update.message.reply_text(
            "Right now only cyrillic characters are supported :( \n"
            "Enter a Kyrgyz text:")
    else:
        match_list = run_apertium_tagger(input, mode="text")
        output_list, error_list = [], []
        for word_match in match_list:
            if not word_match:
                continue
            else:
                word_match_1 = str(word_match[1]).replace("J", "Й")
                word_match_1 = word_match_1.replace("j", "й")
                if word_match[2] is not None:
                    output_list.append(word_match_1 + "*" + word_match[2] + "*")
                else:
                    output_list.append("_" + word_match_1 + "_")
                    error_list.append(word_match_1)
        update.message.reply_text(" ".join(output_list), parse_mode=ParseMode.MARKDOWN)
        if error_list:
            update.message.reply_text(
                "⚠️ These words were not recognized by the parser ⚠️")
            for word in error_list:
                update.message.reply_text(word)
            update.message.reply_text("*-----*", parse_mode=ParseMode.MARKDOWN)
def get_none_arabic_words(text):
    none_arabic = list()
    ad = AlphabetDetector()
    for word in text.split():
        if not ad.is_arabic(word):
            none_arabic.append(word)
    return none_arabic
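# Illustrative usage sketch for get_none_arabic_words above (not from the
# original source; the sample sentence is made up). AlphabetDetector.is_arabic
# works on whole tokens, so non-Arabic tokens are collected as-is.
if __name__ == '__main__':
    sample = 'هذا mixed نص text'
    print(get_none_arabic_words(sample))  # expected: ['mixed', 'text']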
def test_locale(self):
    ad = AlphabetDetector()
    faker = Faker('ru_RU')
    seeder = Seeder(faker)
    seeder.add_entity(Game, 5)
    seeder.execute()
    self.assertTrue(all([ad.is_cyrillic(game.title) for game in Game.objects.all()]))
def detect_alphabet(str):
    ad = AlphabetDetector()
    uni_string = unicode(str, "utf-8")
    ab = ad.detect_alphabet(uni_string)
    if "CYRILLIC" in ab:
        return "CYRILLIC"
    return ab.pop() if len(ab) != 0 else 'UND'
def main():
    # catch CTRL-C
    signal.signal(signal.SIGINT, sigint_handler)

    # setup logging
    LOG.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    log_file = logging.FileHandler(LOG_FILE)
    log_file.setLevel(logging.INFO)
    log_file.setFormatter(formatter)
    LOG.addHandler(log_file)
    log_stdout = logging.StreamHandler()
    log_stdout.setLevel(logging.INFO)
    log_stdout.setFormatter(formatter)
    LOG.addHandler(log_stdout)
    LOG.info('Yatranslator is starting!')

    bot = TranslatorCore(AlphabetDetector(),
                         os.environ['TELE_TOKEN'],
                         os.environ['YA_API_KEY'],
                         os.environ['ADMIN_USERNAME'])
    while True:
        bot.run()
        time.sleep(1)
def check_case(file):
    lines = file.readlines()
    ad = AlphabetDetector()
    for word in lines:
        word = word.strip()  # drop the trailing newline so isalpha()/islower() behave as intended
        if word.isalpha() and ad.is_latin(word) and not word.islower():
            print(word)
            return "Please repair the case to lowercase in " + file.name + "."
    return "PASS"
def isArabic(s):
    ad = AlphabetDetector()
    string_without_numbers = str(s).translate(None, string.digits)
    if string_without_numbers == '':
        return False
    else:
        return ad.only_alphabet_chars(unicode(string_without_numbers), 'ARABIC')
def check_alphabet(str, alphabet, only=True):
    ad = AlphabetDetector()
    if only:
        return ad.only_alphabet_chars(str, alphabet.upper())
    else:
        for i in str:
            if ad.is_in_alphabet(i, alphabet.upper()):
                return True
        return False
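# Illustrative usage sketch for check_alphabet above (sample strings are made
# up). With only=True every alphabetic character must belong to the alphabet
# (alphabet_detector skips digits and punctuation); with only=False a single
# matching character is enough.
print(check_alphabet("hello", "latin"))                  # True
print(check_alphabet("hello мир", "latin"))              # False: Cyrillic present
print(check_alphabet("hello мир", "latin", only=False))  # True: at least one Latin char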
def validate_password(password):
    ad = AlphabetDetector()
    if len(password) <= 3:
        tk.messagebox.showerror('Information', 'Password too short (at least 4 symbols)')
        return False
    elif not ad.only_alphabet_chars(password, 'LATIN'):
        tk.messagebox.showerror('Information', 'Password must contain latin chars and/or numbers')
        return False
    return True
def check_alphabet(str, alphabet, only=True):
    ad = AlphabetDetector()
    uni_string = unicode(str, "utf-8")
    if only:
        return ad.only_alphabet_chars(uni_string, alphabet.upper())
    else:
        for i in uni_string:
            if ad.is_in_alphabet(i, alphabet.upper()):
                return True
        return False
def drop_cols_not_in_mapping(df, mapper):
    ad = AlphabetDetector()
    for header in list(df.columns):
        if ad.is_hebrew(header) and clean_text(header) not in list(mapper.keys()):
            df = drop_col_if_exists(df, header)
        if ad.is_latin(header) and header not in list(mapper.values()):
            df = drop_col_if_exists(df, header)
    return df
def detect_alphabet(lstr):
    ad = AlphabetDetector()
    lalphabets = []
    for l in lstr:
        ab = ad.detect_alphabet(l)
        if "CYRILLIC" in ab:
            lalphabets.append("CYRILLIC")
        else:
            lalphabets.append(ab.pop() if len(ab) != 0 else 'UND')
    return lalphabets
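# Illustrative usage sketch for the list-based detect_alphabet above (input
# values are made up). Each element gets a single label; CYRILLIC takes
# precedence when a string mixes scripts, and strings with no alphabetic
# characters come back as 'UND'.
print(detect_alphabet(["привет", "hello", "مرحبا", "123"]))
# expected: ['CYRILLIC', 'LATIN', 'ARABIC', 'UND']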
def keep_text_with_diacritics(text):
    ad = AlphabetDetector()
    lines = text.split('\n')
    result_list = list()
    for line in lines:
        clean_line = list()
        for word in line.split():
            if ad.is_arabic(word) and has_diacritics(word):
                clean_line.append(word)
        result_list.append(' '.join(clean_line))
    return '\n'.join(result_list)
def _test_caps_to_camel_case(a, b):
    ad = AlphabetDetector()
    if type(a) is not str:
        assert a == b
    elif type(a) is str and not ad.is_latin(a):
        assert a == b
    elif type(a) is str and len(a) < 4 or a.upper() != a:
        assert a == b
    else:
        assert len(a) == len(b)
        assert a != b
def keep_only_arabic(text):
    ad = AlphabetDetector()
    clean_lines = list()
    for line in text.splitlines():
        clean_line = list()
        for word in line.split():
            if ad.is_arabic(word):
                if word.isalpha():
                    clean_line.append(word)
        clean_lines.append(' '.join(clean_line))
    return '\n'.join(clean_lines)
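# Illustrative usage sketch for keep_only_arabic above (the input line is made
# up). Tokens that are not purely alphabetic Arabic, such as numbers and Latin
# words, are dropped.
print(keep_only_arabic('مرحبا hello 123 بالعالم'))
# expected: 'مرحبا بالعالم'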
def getDescription(self, photoInfo):
    from alphabet_detector import AlphabetDetector
    ad = AlphabetDetector()
    text = u''
    descriptionString = photoInfo['photo']['description']['_content']
    for line in descriptionString.splitlines():
        if 'CYRILLIC' not in ad.detect_alphabet(line):
            text += "\n" + line
    return text
def make_mt_data_from_master(bigrams=False):
    ad = AlphabetDetector()
    source = codecs.open("data/mt/source.txt", "w", encoding="utf-8")
    target = codecs.open("data/mt/target.txt", "w", encoding="utf-8")
    source_valid = codecs.open("data/mt/source_valid.txt", "w", encoding="utf-8")
    target_valid = codecs.open("data/mt/target_valid.txt", "w", encoding="utf-8")
    mapped_titles = json.load(
        codecs.open("data/master-generated-titles-filtered.json", "r", encoding="utf-8"))
    keys = list(mapped_titles.keys())
    random.shuffle(keys)
    i = 0
    for key in keys:
        orig = clean_text(key.replace(".", ""))
        humorous_ones = mapped_titles[key]
        if "output" not in humorous_ones:
            continue
        for humorous in humorous_ones["output"]:
            i += 1
            if "ARABIC" in ad.detect_alphabet(humorous):
                # skip the ones with arabic characters
                continue
            humorous = clean_text(humorous.replace(" .", ""))
            if not bigrams:
                target.write(humorous + "\n")
                source.write(orig + "\n")
                if i % 4 == 0:
                    target_valid.write(humorous + "\n")
                    source_valid.write(orig + "\n")
            if bigrams:
                source_grams, target_grams = __make_bigram_lists(orig, humorous)
                for x in range(len(source_grams)):
                    source_gram = " ".join(source_grams[x])
                    target_gram = " ".join(target_grams[x])
                    source.write(source_gram + "\n")
                    target.write(target_gram + "\n")
                    if i % 10 == 0:
                        source_valid.write(source_gram + "\n")
                        target_valid.write(target_gram + "\n")
    source.close()
    target.close()
    source_valid.close()
    target_valid.close()
def kor2en(str):
    ad = AlphabetDetector()
    inputTitle = str
    outputTitle = ""
    # set invalid chars except . for extension
    invalidChars = set(string.punctuation.replace(".", ""))
    # drop invalid chars
    for ch in inputTitle:
        if ch not in invalidChars:
            outputTitle += ch
    if not ad.only_alphabet_chars(outputTitle, "LATIN"):
        transliter = Transliter(academic)
        outputTitle = transliter.translit(outputTitle)
    return outputTitle
def get_words_cases(self, words: Sequence[str]) -> List[str]:
    pluralized_words = []
    alphabet_detector = AlphabetDetector()
    if isinstance(words, str):
        words = [words]
    for word in words:
        alphabets = alphabet_detector.detect_alphabet(word)
        if 'LATIN' in alphabets:
            pluralized_words.append(inflection.pluralize(word))
        elif 'CYRILLIC' in alphabets:
            pluralized_words += self.get_cyrillic_word_cases(word)
        else:
            self.logger.warn(
                'Unsupported language for text: {}'.format(word))
    return pluralized_words
def validate_username(username):
    ad = AlphabetDetector()
    if not ad.only_alphabet_chars(username, 'LATIN'):
        tk.messagebox.showerror('Information', 'Username must contain latin chars and/or numbers')
        return False
    try:
        with open(sys.path[1] + '\\users\\users.txt', 'r') as credentials:
            for line in credentials:
                line = line.split(':')
                if line[0] == username:
                    tk.messagebox.showerror('Information', 'Username already exists')
                    return False
        return True
    except FileNotFoundError:
        print('users.txt file not found')
        return False
def detect_language(s):
    try:
        replace_list = ['–', '•']
        for x in replace_list:
            if x in s:
                s = s.replace(x, '')
        s.encode('ascii')
    except UnicodeEncodeError:
        ad = AlphabetDetector()
        lang = ad.detect_alphabet(s)
        if 'ARABIC' not in lang:
            return 'other'
        return 'fa'
    else:
        return 'en'
def return_split_text_by_characterencode(orig_sentence):
    ad = AlphabetDetector()
    character_coding_list = ad.detect_alphabet(orig_sentence)
    for character_coding in character_coding_list:
        print(character_coding)
        if 'HIRAGANA' in character_coding or 'KATAKANA' in character_coding or 'CJK' in character_coding:
            text_list = cut_text(f'{orig_sentence}', 43)
        else:
            text_list = cut_text(f'{orig_sentence}', 88)
        final_text = ''
        for text in text_list:
            final_text += f'{text}\n'
    return final_text
def create_marc_seq_file(self):
    """Transform a MARC-formatted DataFrame into a MARC sequential file."""
    logger = logging.getLogger(__name__)
    logger.info(
        f"[MARC Sequential] Creating MARC sequential file for {self.collection_id}")
    df = self.marc_data
    ad = AlphabetDetector()
    output_file_name = self.data_path_processed / (
        self.collection_id + "_final_" + self.dt_now + ".txt")
    with open(output_file_name, "w", encoding="utf8") as f:
        for index, row in df.iterrows():
            f.write(f"{index} LDR {row['LDR']}\n")
            f.write(f"{index} 001 {index}\n")
            for col in df:
                # if field is empty, skip
                if str(row[col]) == "" or col == "LDR":
                    continue
                # # check language
                # lang = ad.detect_alphabet(str(row[col]))
                # if "HEBREW" in lang:
                #     lang = "H"
                # else:
                #     lang = "L"
                # construct 5 character field code
                if "_" in col:
                    col_name = "{:<5}".format(col[:col.find("_")])
                else:
                    col_name = "{:<5}".format(col)
                # construct the line for the MARC sequential file
                line = f"{index} {col_name} {str(row[col])}\n"
                # if col_name == '035':
                line = line.replace("$$$$", "$$")
                line = line.replace("$$a$$a", "$$")
                # write to file
                f.write(line)