Ejemplo n.º 1
0
def do_split_marker_by_script(sfm,find_marker,script1,script2,new_marker1,new_marker2):
    """Find every field tagged `find_marker` and insert script-split copies.

    For each matching field the script(s) of its data are detected with
    AlphabetDetector and new fields named ``<marker>_<SCRIPT>`` are inserted
    right after the original field.  Mutates `sfm` in place (``new_sfm`` is
    an alias, not a copy) and returns ``(sfm, match_count)``.

    NOTE(review): script1/script2/new_marker1/new_marker2 are currently
    unused; kept for interface compatibility — confirm with callers.
    """
    ad = AlphabetDetector()
    new_sfm = sfm  # alias, not a copy: the caller's list is mutated
    count = 0
    logging.info("\nIn do_split_marker_by_script code:\n")

    for i, entry in enumerate(new_sfm):
        for j, field in enumerate(entry):
            marker, data = field
            if marker == find_marker:
                count += 1
                scripts = ad.detect_alphabet(data)
                script_count = len(scripts)

                if script_count == 1:
                    script = next(iter(scripts))
                    new_field = [marker + '_' + script, data]
                    new_sfm[i].insert(j + 1, new_field)

                elif script_count > 1:
                    print("\nFound {} scripts: {}".format(len(scripts), scripts))
                    print("Data is {}".format(data))
                    for script_number, script in enumerate(scripts):
                        # Keep spaces plus characters belonging to this script.
                        # Fix: the original referenced an undefined name
                        # `space`; a literal space character is intended.
                        string_list = [character for character in data
                                       if character == ' '
                                       or script in ad.detect_alphabet(character)]
                        string = ''.join(string_list).strip()
                        new_field = [marker + '_' + script, string]
                        print("New_field is {}".format(new_field))
                        new_sfm[i].insert(j + script_number + 1, new_field)
                    print(new_sfm[i])
    return new_sfm, count
Ejemplo n.º 2
0
    def __freqs_dict(self, raw_text):
        """Build a frequency-expanded text from *raw_text*.

        Tokenizes, filters out Russian stopwords, custom stopwords,
        non-alphabetic tokens, short tokens and non-Cyrillic tokens, then
        returns a string where each of the `self.__max_words` most common
        tokens is repeated once per occurrence (word-cloud style input).
        """
        t_start = time()
        print('Making filtered text...')

        stopset = set(stopwords.words('russian'))
        ad = AlphabetDetector()

        tokens = word_tokenize(raw_text)
        tokens_filtered = [w.lower() for w in tokens
                           if w not in stopset
                           and w not in self.__custom_stopwords
                           and w.isalpha()
                           and len(w) >= self.__min_word_len
                           and ad.is_cyrillic(w)]

        freqs_tokenized_text = FreqDist(tokens_filtered)
        freqs_most_common = OrderedDict(freqs_tokenized_text.most_common(self.__max_words))

        # Fix: the original built res_text with += in a nested loop
        # (quadratic); a single join produces the identical string,
        # including the trailing space after the last word.
        res_text = ''.join(word + ' '
                           for word, freq in freqs_most_common.items()
                           for _ in range(freq))

        t_end = time()
        print("TIME = %.2f s" % (t_end - t_start))

        return res_text
Ejemplo n.º 3
0
def load_json_newsletters(corpus_dir):
    """Load newsletter JSON files and split article bodies by script.

    Scans ``corpus_dir`` for ``*.json`` files; each article body is cleaned
    and appended to the Arabic corpus when Arabic script is detected,
    otherwise to the English corpus.  Files whose JSON lacks an 'articles'
    key are skipped.  Returns (arb_corpus, eng_corpus).
    """
    alphabet_detector = AlphabetDetector()
    arb_corpus = list()
    eng_corpus = list()
    ids = list()  # NOTE(review): never populated; kept for compatibility
    json_files = glob.glob(corpus_dir + '/*.json')
    print('# of newsletters:', len(json_files))
    for json_file in json_files:
        # Fix: close the file handle (the original leaked it via
        # json.loads(open(json_file).read())).
        with open(json_file) as fh:
            json_doc = json.load(fh)
        try:
            j_articles = json_doc['articles']
            for e in j_articles:
                doc_id = j_articles[e]['id']
                title = clean_text(j_articles[e]['title'])
                text = clean_text(j_articles[e]['body'])
                link = j_articles[e]['link']
                if text and 'ARABIC' in alphabet_detector.detect_alphabet(text):
                    arb_corpus.append(text)
                else:
                    # NOTE(review): empty bodies also land here — confirm
                    # that is intended.
                    eng_corpus.append(text)
        except KeyError:
            continue

    print('# of Arabic documents:', len(arb_corpus))
    print('# of English documents:', len(eng_corpus))
    return arb_corpus, eng_corpus
Ejemplo n.º 4
0
def extractor():
    """Flask view: extract the article at ``?url=`` with several extractors
    and render a comparison page."""
    alpha_det = AlphabetDetector()
    url = request.args.get('url')
    if not url:
        return render_template('no_url.html')
    url = url.strip()
    title, newspaper3k_text = util.extract_newspaper3k(url)

    # An Arabic title means the page renders right-to-left.
    if 'ARABIC' in alpha_det.detect_alphabet(title):
        text_dir, lang = 'rtl', 'Arabic'
    else:
        text_dir, lang = 'ltr', 'English'

    date = util.get_date(url)

    texts = OrderedDict()
    texts['Justext'] = util.get_text_justext(url, lang)
    texts['Newspaper3k'] = newspaper3k_text
    texts['NewsPlease'] = util.extract_news_please(url)
    texts['pextract'] = util.get_pextract(url)

    return render_template('article_info.html',
                           url=url,
                           title=title,
                           date=date,
                           text_dir=text_dir,
                           texts=texts)
Ejemplo n.º 5
0
def Pic_Book(x):
    """Draw the (possibly Arabic) book title from spreadsheet row *x* onto a
    template image and save it to the path stored in column 17.

    Reads the module-level ``sheet`` (spreadsheet worksheet) and writes the
    resized image to ``C:`` + the path cell.  Fixes applied: removed
    duplicated font/color assignments, removed the useless ``im2`` binding
    (``Image.save`` returns None) and dropped ``== True``/``!= True``
    comparisons.
    """
    im1 = Image.open(r'C:\\Users\\user\\Desktop\\darlusail\\book.jpg')
    Pic_cell = sheet.cell(x, 17).value
    Title_cell = str(sheet.cell(x, 4).value)
    draw = ImageDraw.Draw(im1)
    font = ImageFont.truetype('arial.ttf', size=10)  # desired size
    color = 'rgb(0, 0, 0)'  # black color
    # Reshape + bidi so Arabic glyphs join and render right-to-left.
    txt = arabic_reshaper.reshape(Title_cell)
    message = get_display(txt)
    wrapper = textwrap3.TextWrapper(width=20)
    word_list = wrapper.wrap(text=message)
    caption_new = ''
    ad = AlphabetDetector()
    if ad.is_arabic(txt) or not ad.is_latin(txt):
        # Arabic (or non-Latin) text: stack wrapped lines in reverse order.
        print(" the name contains arabic words")
        for ii in reversed(word_list):
            caption_new = caption_new + ii + '\n'
    if ad.is_latin(txt):
        print(" the name doesn't contains arabic words")
        for ii in word_list:
            caption_new = caption_new + ii + '\n'
    print(caption_new)
    # NOTE(review): textsize/ANTIALIAS are deprecated in newer Pillow;
    # kept as-is to preserve behavior on the pinned version.
    w, h = draw.textsize(caption_new, font=font)
    W, H = im1.size
    x, y = 0.5 * (W - w), 0.5 * H  # center horizontally, mid-height
    draw.text((x, y), caption_new, fill=color, font=font)
    size = 563, 285  # target dimensions for the saved image
    im1 = im1.resize(size, Image.ANTIALIAS)
    im1.save(r"C:" + Pic_cell)  # save with the new configuration
Ejemplo n.º 6
0
    def process_word(word):
        """Normalize one token for indexing; return '' to discard it.

        Drops non-Latin tokens, URL/image fragments, stop words and
        single-character-repeat strings; lowercases, stems and strips
        punctuation, digits and the registered-trademark sign.
        """
        ad = AlphabetDetector()
        if not ad.is_latin(word):
            return ''

        banned_fragments = (
            '.html', 'http://', 'https://', '.jpg', '.svg', '.png'
        )
        if any(banned in word for banned in banned_fragments):
            return ''

        word = PorterStemmer().stem(word.lower())
        # Single C-level pass instead of a loop of chained .replace() calls.
        word = word.translate(str.maketrans('', '', punctuation + '0123456789®'))

        # Set membership instead of a linear scan per call.
        stop_words = set(stopwords.words('english'))
        stop_words.update(('aa', 'aaa', 'bb', 'bbb'))
        if word in stop_words:
            return ''

        if len(set(word)) == 1:
            return ''

        return word
Ejemplo n.º 7
0
def parse_input(bot, update, user_data):
    """Telegram handler: morphologically parse a single Kyrgyz word.

    Rejects Latin-script input, stores the stem (and affix flag/value) in
    ``user_data['stem']`` and replies with the parse result.
    """
    text = update.message.text  # renamed from `input` (shadowed the builtin)
    ad = AlphabetDetector()
    if ad.is_latin(text):
        update.message.reply_text(
            "Right now only cyrillic characters are supported :( \n"
            "Enter a Kyrgyz word:")
    else:
        result = run_apertium_tagger(text)
        # Fix: the original's `if user_data.setdefault("stem", []): ...`
        # dance always left an empty list here; assign one directly.
        user_data["stem"] = []
        # The tagger emits Latin J/j for the Cyrillic й.
        result_2 = str(result[2]).replace("J", "Й").replace("j", "й")
        user_data['stem'].append(result_2.lower())

        if result[3] is not None:
            user_data['stem'].append(1)
            user_data['stem'].append(result[3])
            reply = result_2 + "*" + result[3] + "*"
            update.message.reply_text(reply, parse_mode=ParseMode.MARKDOWN)
            update.message.reply_text(
                "Look up stem in the dictionary? \n"
                "Press /find or enter next word to continue.")
        else:
            user_data['stem'].append(0)
            update.message.reply_text(result_2)
            update.message.reply_text(
                "Parsing didn't work :( \n"
                "Look up stem in the dictionary? \n"
                "Press /find or enter next word to continue.")
Ejemplo n.º 8
0
def parse_text(bot, update):
    """Telegram handler: parse a whole Kyrgyz text word-by-word and report
    any words the tagger could not analyze."""
    text = update.message.text  # renamed from `input` (shadowed the builtin)
    ad = AlphabetDetector()
    if ad.is_latin(text):
        update.message.reply_text(
            "Right now only cyrillic characters are supported :( \n"
            "Enter a Kyrgyz text:")
        return

    match_list = run_apertium_tagger(text, mode="text")
    output_list, error_list = [], []
    for word_match in match_list:
        if not word_match:
            continue
        # The tagger emits Latin J/j for the Cyrillic й.
        word_match_1 = str(word_match[1]).replace("J", "Й").replace("j", "й")

        if word_match[2] is not None:
            output_list.append(word_match_1 + "*" + word_match[2] + "*")
        else:
            # Unrecognized word: mark with underscores and collect for the
            # warning message below.
            output_list.append("_" + word_match_1 + "_")
            error_list.append(word_match_1)

    update.message.reply_text(" ".join(output_list),
                              parse_mode=ParseMode.MARKDOWN)
    if error_list:
        update.message.reply_text(
            "⚠️ These words were not recognized by the parser ⚠️")
        for word in error_list:
            update.message.reply_text(word)
    update.message.reply_text("*-----*", parse_mode=ParseMode.MARKDOWN)
Ejemplo n.º 9
0
def get_none_arabic_words(text):
    """Return the whitespace-separated words of *text* whose script is not
    Arabic, in their original order."""
    detector = AlphabetDetector()
    return [token for token in text.split() if not detector.is_arabic(token)]
Ejemplo n.º 10
0
 def test_locale(self):
     """Seeding with the ru_RU locale must yield only Cyrillic game titles."""
     detector = AlphabetDetector()
     seeder = Seeder(Faker('ru_RU'))
     seeder.add_entity(Game, 5)
     seeder.execute()
     all_cyrillic = all(detector.is_cyrillic(game.title)
                        for game in Game.objects.all())
     self.assertTrue(all_cyrillic)
Ejemplo n.º 11
0
def detect_alphabet(str):
    """Detect the dominant alphabet of a UTF-8 byte string (Python 2 code:
    relies on the `unicode` builtin).

    Returns "CYRILLIC" if any Cyrillic is present, otherwise an arbitrary
    detected alphabet name, or 'UND' when none is detected.
    """
    detector = AlphabetDetector()
    alphabets = detector.detect_alphabet(unicode(str, "utf-8"))
    if "CYRILLIC" in alphabets:
        return "CYRILLIC"
    if alphabets:
        return alphabets.pop()
    return 'UND'
Ejemplo n.º 12
0
def main():
    """Entry point: install the SIGINT handler, configure file + stdout
    logging, then run the translator bot forever."""
    # catch CTRL-C
    signal.signal(signal.SIGINT, sigint_handler)

    # Configure logging: identical formatter on a file handler and a
    # stdout handler, both at INFO.
    LOG.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    for handler in (logging.FileHandler(LOG_FILE), logging.StreamHandler()):
        handler.setLevel(logging.INFO)
        handler.setFormatter(formatter)
        LOG.addHandler(handler)

    LOG.info('Yatranslator is starting!')

    bot = TranslatorCore(AlphabetDetector(), os.environ['TELE_TOKEN'],
                         os.environ['YA_API_KEY'],
                         os.environ['ADMIN_USERNAME'])
    while True:
        bot.run()
        time.sleep(1)
Ejemplo n.º 13
0
def check_case(file):
    """Verify that every alphabetic Latin word in *file* is lowercase.

    Returns "PASS" when the check succeeds, otherwise a message naming the
    file that needs repair.
    """
    lists = file.readlines()
    ad = AlphabetDetector()
    for raw in lists:
        # Fix: readlines() keeps the trailing '\n', which made isalpha()
        # False for every line, so the check never fired. Strip first.
        word = raw.strip()
        if word.isalpha() and ad.is_latin(word) and not word.islower():
            print(word)
            return "Please repair the case to lowercase in " + file.name + "."
    return "PASS"
Ejemplo n.º 14
0
def isArabic(s):
    """Return True if *s*, with ASCII digits removed, is entirely Arabic script.

    Python 2 code: uses ``str.translate(None, deletechars)`` and the
    ``unicode`` builtin, neither of which exists in Python 3.
    Returns False when nothing remains after stripping digits.
    """
    ad = AlphabetDetector()
    # Python 2 byte-string translate: None table + deletechars drops digits.
    string_without_numbers = str(s).translate(None, string.digits)
    if string_without_numbers == '':
        # Purely numeric (or empty) input: not Arabic.
        return False
    else:
        return ad.only_alphabet_chars(unicode(string_without_numbers),
                                      'ARABIC')
Ejemplo n.º 15
0
def check_alphabet(str, alphabet, only=True):
    """Check *str* against *alphabet* (case-insensitive alphabet name).

    With only=True (default), every character must belong to the alphabet;
    otherwise a single matching character suffices.
    """
    ad = AlphabetDetector()
    target = alphabet.upper()
    if only:
        return ad.only_alphabet_chars(str, target)
    return any(ad.is_in_alphabet(ch, target) for ch in str)
Ejemplo n.º 16
0
 def validate_password(password):
     """Validate a password: at least 4 symbols, Latin characters/digits only.

     Shows a tk error dialog and returns False on failure; True otherwise.
     """
     detector = AlphabetDetector()
     if len(password) < 4:
         tk.messagebox.showerror('Information', 'Password too short (at least 4 symbols)')
         return False
     if not detector.only_alphabet_chars(password, 'LATIN'):
         tk.messagebox.showerror('Information', 'Password must contain latin chars and/or numbers')
         return False
     return True
Ejemplo n.º 17
0
def check_alphabet(str, alphabet, only=True):
    """Check a UTF-8 byte string against *alphabet* (Python 2 code: decodes
    with the `unicode` builtin first).

    With only=True (default), every character must belong to the alphabet;
    otherwise a single matching character suffices.
    """
    ad = AlphabetDetector()
    decoded = unicode(str, "utf-8")
    target = alphabet.upper()
    if only:
        return ad.only_alphabet_chars(decoded, target)
    return any(ad.is_in_alphabet(ch, target) for ch in decoded)
Ejemplo n.º 18
0
def drop_cols_not_in_mapping(df, mapper):
    """Drop DataFrame columns whose header is not covered by *mapper*.

    Hebrew headers are kept only when their cleaned form is a key of
    *mapper*; Latin headers only when they appear among *mapper*'s values.
    Returns the (possibly reduced) DataFrame.
    """
    ad = AlphabetDetector()
    for header in list(df.columns):
        # Membership tests on the mapping and its values view directly —
        # the original built throwaway list(...) copies per column.
        if ad.is_hebrew(header) and clean_text(header) not in mapper:
            df = drop_col_if_exists(df, header)

        if ad.is_latin(header) and header not in mapper.values():
            df = drop_col_if_exists(df, header)
    return df
Ejemplo n.º 19
0
def detect_alphabet(lstr):
    """Map each string in *lstr* to an alphabet name.

    "CYRILLIC" wins whenever present; otherwise an arbitrary detected
    alphabet is used, or 'UND' when detection returns nothing.
    """
    detector = AlphabetDetector()
    labels = []
    for text in lstr:
        scripts = detector.detect_alphabet(text)
        if "CYRILLIC" in scripts:
            labels.append("CYRILLIC")
        elif scripts:
            labels.append(scripts.pop())
        else:
            labels.append('UND')
    return labels
Ejemplo n.º 20
0
def keep_text_with_diacritics(text):
    """Keep only the Arabic words carrying diacritics, line by line.

    Each output line contains the surviving words of the corresponding
    input line joined by single spaces; line count is preserved.
    """
    detector = AlphabetDetector()
    kept_lines = []
    for line in text.split('\n'):
        survivors = [word for word in line.split()
                     if detector.is_arabic(word) and has_diacritics(word)]
        kept_lines.append(' '.join(survivors))
    return '\n'.join(kept_lines)
Ejemplo n.º 21
0
def _test_caps_to_camel_case(a, b):
    """Assert the caps→camelCase transform contract for the pair (a, b).

    Inputs that are not strings, not Latin script, shorter than 4 chars,
    or not fully uppercase must pass through unchanged (a == b); anything
    else must change while keeping its length.
    """
    detector = AlphabetDetector()
    # Short-circuit order matters: is_latin/upper are only evaluated for
    # actual strings, exactly as in the original chain of elifs.
    unchanged = (type(a) is not str
                 or not detector.is_latin(a)
                 or len(a) < 4
                 or a.upper() != a)
    if unchanged:
        assert a == b
    else:
        assert len(a) == len(b)
        assert a != b
Ejemplo n.º 22
0
def keep_only_arabic(text):
    """Keep only purely-alphabetic Arabic words, preserving line structure.

    Each output line contains the surviving words of the corresponding
    input line joined by single spaces.
    """
    detector = AlphabetDetector()
    filtered_lines = []
    for line in text.splitlines():
        survivors = [word for word in line.split()
                     if detector.is_arabic(word) and word.isalpha()]
        filtered_lines.append(' '.join(survivors))
    return '\n'.join(filtered_lines)
Ejemplo n.º 23
0
    def getDescription(self, photoInfo):
        """Build a description from the non-Cyrillic lines of the photo's
        Flickr description, each prefixed with a newline."""
        from alphabet_detector import AlphabetDetector
        ad = AlphabetDetector()

        description = photoInfo['photo']['description']['_content']
        text = u''
        for line in description.splitlines():
            if 'CYRILLIC' not in ad.detect_alphabet(line):
                text += "\n" + line

        return text
Ejemplo n.º 24
0
def make_mt_data_from_master(bigrams=False):
    """Write MT source/target (and validation) files from the master
    generated-titles JSON.

    For each original title and each of its humorous variants (skipping
    variants containing Arabic characters), writes either whole titles
    (bigrams=False, every 4th pair also to the validation files) or
    aligned bigram pairs (bigrams=True, every 10th batch to validation).

    Fix: all five file handles are now managed by ``with`` blocks so they
    are closed even if an exception occurs mid-write (the original leaked
    them on error).
    """
    ad = AlphabetDetector()
    with codecs.open("data/master-generated-titles-filtered.json",
                     "r",
                     encoding="utf-8") as fh:
        mapped_titles = json.load(fh)
    keys = list(mapped_titles.keys())
    random.shuffle(keys)
    with codecs.open("data/mt/source.txt", "w", encoding="utf-8") as source, \
         codecs.open("data/mt/target.txt", "w", encoding="utf-8") as target, \
         codecs.open("data/mt/source_valid.txt", "w",
                     encoding="utf-8") as source_valid, \
         codecs.open("data/mt/target_valid.txt", "w",
                     encoding="utf-8") as target_valid:
        i = 0
        for key in keys:
            orig = clean_text(key.replace(".", ""))
            humorous_ones = mapped_titles[key]
            if "output" not in humorous_ones:
                continue
            for humorous in humorous_ones["output"]:
                i += 1
                if "ARABIC" in ad.detect_alphabet(humorous):
                    # skip the ones with arabic characters
                    continue
                humorous = clean_text(humorous.replace(" .", ""))
                if not bigrams:
                    target.write(humorous + "\n")
                    source.write(orig + "\n")
                    if i % 4 == 0:
                        target_valid.write(humorous + "\n")
                        source_valid.write(orig + "\n")
                if bigrams:
                    source_grams, target_grams = __make_bigram_lists(
                        orig, humorous)
                    for x in range(len(source_grams)):
                        source_gram = " ".join(source_grams[x])
                        target_gram = " ".join(target_grams[x])
                        source.write(source_gram + "\n")
                        target.write(target_gram + "\n")
                        if i % 10 == 0:
                            source_valid.write(source_gram + "\n")
                            target_valid.write(target_gram + "\n")
Ejemplo n.º 25
0
def kor2en(str):
    """Strip punctuation (keeping '.' so file extensions survive) and
    romanize the result if it is not already Latin script.

    Returns the cleaned — and, for Korean input, transliterated — title.
    """
    ad = AlphabetDetector()
    # Invalid characters: all punctuation except '.' (kept for extensions).
    invalidChars = set(string.punctuation.replace(".", ""))
    # Fix: the original filtered with an index loop containing a useless
    # manual `i += 1`; a join over a generator is equivalent and clearer.
    outputTitle = "".join(ch for ch in str if ch not in invalidChars)
    if not ad.only_alphabet_chars(outputTitle, "LATIN"):
        transliter = Transliter(academic)
        outputTitle = transliter.translit(outputTitle)
    return outputTitle
Ejemplo n.º 26
0
 def get_words_cases(self, words: Sequence[str]) -> List[str]:
     """Expand *words* into search variants: Latin words are pluralized,
     Cyrillic words are expanded into their grammatical cases; anything
     else is logged as unsupported and skipped.

     A bare string is treated as a one-element sequence.
     """
     detector = AlphabetDetector()
     if isinstance(words, str):
         words = [words]
     variants: List[str] = []
     for word in words:
         scripts = detector.detect_alphabet(word)
         if 'LATIN' in scripts:
             variants.append(inflection.pluralize(word))
         elif 'CYRILLIC' in scripts:
             variants += self.get_cyrillic_word_cases(word)
         else:
             self.logger.warn(
                 'Unsupported language for text: {}'.format(word))
     return variants
Ejemplo n.º 27
0
 def validate_username(username):
     """Validate a new username: Latin characters/digits only and not
     already present in users.txt.

     Shows a tk error dialog and returns False on failure; True otherwise.
     """
     detector = AlphabetDetector()
     if not detector.only_alphabet_chars(username, 'LATIN'):
         tk.messagebox.showerror('Information', 'Username must contain latin chars and/or numbers')
         return False
     try:
         # users.txt stores one "username:password" record per line.
         with open(sys.path[1] + '\\users\\users.txt', 'r') as credentials:
             for record in credentials:
                 fields = record.split(':')
                 if fields[0] == username:
                     tk.messagebox.showerror('Information', 'Username already exists')
                     return False
         return True
     except FileNotFoundError:
         print('users.txt file not found')
         return False
Ejemplo n.º 28
0
def detect_language(s):
    """Classify *s*: 'en' if ASCII-encodable (after dropping – and •),
    'fa' if Arabic script is detected, otherwise 'other'."""
    # Typographic dash and bullet are stripped before the ASCII probe.
    for junk in ('–', '•'):
        s = s.replace(junk, '')
    try:
        s.encode('ascii')
    except UnicodeEncodeError:
        alphabets = AlphabetDetector().detect_alphabet(s)
        return 'fa' if 'ARABIC' in alphabets else 'other'
    return 'en'
Ejemplo n.º 29
0
def return_split_text_by_characterencode(orig_sentence):
    """Wrap *orig_sentence* into newline-terminated chunks, with the wrap
    width chosen by detected script (43 chars for CJK/kana, 88 otherwise).

    Fix: ``final_text`` is now initialized up front, so an empty detection
    result returns '' instead of raising UnboundLocalError.
    NOTE(review): when several scripts are detected, the last one iterated
    wins — this mirrors the original's overwrite-per-script behavior.
    """
    ad = AlphabetDetector()
    character_coding_list = ad.detect_alphabet(orig_sentence)
    final_text = ''
    for character_coding in character_coding_list:
        print(character_coding)
        # CJK/kana glyphs are full-width, so wrap them at roughly half the
        # column count used for everything else.
        if ('HIRAGANA' in character_coding
                or 'KATAKANA' in character_coding
                or 'CJK' in character_coding):
            width = 43
        else:
            width = 88
        text_list = cut_text(f'{orig_sentence}', width)
        final_text = ''.join(f'{text}\n' for text in text_list)
    return final_text
    def create_marc_seq_file(self):
        """
        Transform the MARC-formatted DataFrame (``self.marc_data``) into a
        MARC sequential text file.

        Output path: ``self.data_path_processed /
        <collection_id>_final_<dt_now>.txt``.  Each row is written as
        ``<index> <5-char field code> <value>`` lines, preceded by LDR and
        001 control fields.
        """
        logger = logging.getLogger(__name__)
        logger.info(
            f"[MARC Sequantial] Creating MARC sequantial file for {self.collection_id}"
        )

        df = self.marc_data
        # NOTE(review): unused while the language check below stays commented out.
        ad = AlphabetDetector()
        output_file_name = self.data_path_processed / (
            self.collection_id + "_final_" + self.dt_now + ".txt")

        with open(output_file_name, "w", encoding="utf8") as f:
            for index, row in df.iterrows():

                # Control fields first: leader (LDR) and record id (001).
                f.write(f"{index} LDR   {row['LDR']}\n")
                f.write(f"{index} 001   {index}\n")

                for col in df:
                    # if field is empty, skip
                    if str(row[col]) == "" or col == "LDR":
                        continue

                    # # check language
                    # lang = ad.detect_alphabet(str(row[col]))
                    # if "HEBREW" in lang:
                    #     lang = "H"
                    # else:
                    #     lang = "L"

                    # construct 5 character field code
                    # (columns may carry a "_suffix" to disambiguate duplicates;
                    # only the part before '_' is the MARC tag)
                    if "_" in col:
                        col_name = "{:<5}".format(col[:col.find("_")])
                    else:
                        col_name = "{:<5}".format(col)

                    # construct the line for the MARC sequantial file
                    line = f"{index} {col_name} {str(row[col])}\n"
                    # if col_name == '035':
                    # collapse doubled subfield separators produced upstream
                    line = line.replace("$$$$", "$$")
                    line = line.replace("$$a$$a", "$$")

                    # write to file
                    f.write(line)