from alphabet_detector import AlphabetDetector


def detect_alphabet(text):
    """Return the dominant alphabet of a string, preferring CYRILLIC; 'UND' if none is found."""
    ad = AlphabetDetector()
    if isinstance(text, bytes):
        text = text.decode("utf-8")
    ab = ad.detect_alphabet(text)
    if "CYRILLIC" in ab:
        return "CYRILLIC"
    return ab.pop() if len(ab) != 0 else 'UND'
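# A minimal usage sketch for detect_alphabet above; the sample strings are
# illustrative only.
print(detect_alphabet("hello"))    # LATIN
print(detect_alphabet("привет"))   # CYRILLIC
print(detect_alphabet("1234"))     # UND -- digits carry no alphabet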
def extractor():
    alpha_det = AlphabetDetector()
    url = request.args.get('url')
    if not url:
        return render_template('no_url.html')
    url = url.strip()
    # title = extract_util.get_title(url)
    title, newspaper3k_text = util.extract_newspaper3k(url)
    if 'ARABIC' in alpha_det.detect_alphabet(title):
        text_dir = 'rtl'
        lang = 'Arabic'
    else:
        text_dir = 'ltr'
        lang = 'English'
    date = util.get_date(url)
    text_justext = util.get_text_justext(url, lang)
    news_please_text = util.extract_news_please(url)
    # _, bs4_text = extract_util.get_title_text_BS4(url)
    text_pextract = util.get_pextract(url)
    texts = OrderedDict()
    texts['Justext'] = text_justext
    texts['Newspaper3k'] = newspaper3k_text
    texts['NewsPlease'] = news_please_text
    texts['pextract'] = text_pextract
    return render_template('article_info.html', url=url, title=title, date=date,
                           text_dir=text_dir, texts=texts)
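# Hedged wiring sketch: extractor() assumes a Flask app plus a project-local
# `util` module, neither shown in this snippet; the route path below is an
# assumption, not taken from the original code.
from collections import OrderedDict

from flask import Flask, request, render_template

app = Flask(__name__)
app.add_url_rule('/extract', 'extractor', extractor)  # hypothetical route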
import logging


def do_split_marker_by_script(sfm, find_marker, script1, script2, new_marker1, new_marker2):
    """Find a given marker and split its data by script."""
    ad = AlphabetDetector()
    new_sfm = sfm
    count = 0
    logging.info("\nIn do_split_marker_by_script code:\n")
    for i, entry in enumerate(new_sfm):
        for j, field in enumerate(entry):
            marker, data = field
            if marker == find_marker:
                count = count + 1
                scripts = ad.detect_alphabet(data)
                script_count = len(scripts)
                if script_count == 1:
                    script = next(iter(scripts))
                    new_field = [marker + '_' + script, data]
                    new_sfm[i].insert(j + 1, new_field)
                    # logging.info("\nFound '{}' only containing {}. Adding new field: {}".format(data, script, new_field))
                    # logging.info(new_sfm[i])
                elif script_count > 1:
                    print("\nFound {} scripts: {}".format(len(scripts), scripts))
                    print("Data is {}".format(data))
                    for script_number, script in enumerate(scripts):
                        # Keep spaces plus the characters belonging to the current script.
                        string_list = [character for character in data
                                       if character == ' ' or script in ad.detect_alphabet(character)]
                        string = ''.join(string_list).strip()
                        new_field = [marker + '_' + script, string]
                        print("New_field is {}".format(new_field))
                        new_sfm[i].insert(j + script_number + 1, new_field)
                        print(new_sfm[i])
    return new_sfm, count
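# Hedged usage sketch: the shape of `sfm` (a list of entries, each a list of
# [marker, data] pairs) is inferred from the loop above; the sample record and
# marker name are assumptions.
sample_sfm = [[['lx', 'word слово']]]
new_sfm, n = do_split_marker_by_script(sample_sfm, 'lx', 'LATIN', 'CYRILLIC', 'lx_LATIN', 'lx_CYRILLIC')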
import glob
import json


def load_json_newsletters(corpus_dir):
    alphabet_detector = AlphabetDetector()
    arb_corpus = list()
    eng_corpus = list()
    ids = list()
    json_files = glob.glob(corpus_dir + '/*.json')
    print('# of newsletters:', len(json_files))
    for json_file in json_files:
        with open(json_file) as f:
            json_doc = json.load(f)
        try:
            j_articles = json_doc['articles']
            # print('# of articles:', len(j_articles))
            for e in j_articles:
                doc_id = j_articles[e]['id']
                title = clean_text(j_articles[e]['title'])
                text = clean_text(j_articles[e]['body'])
                # print(text)
                link = j_articles[e]['link']
                if text and 'ARABIC' in alphabet_detector.detect_alphabet(text):
                    arb_corpus.append(text)
                else:
                    eng_corpus.append(text)
        except KeyError:
            continue
    print('# of Arabic documents:', len(arb_corpus))
    print('# of English documents:', len(eng_corpus))
    return arb_corpus, eng_corpus
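# Hedged input sketch: each newsletter JSON is expected to look roughly like
# this (field names taken from the code above; values are illustrative):
# {
#   "articles": {
#     "0": {"id": "a0", "title": "...", "body": "نص عربي", "link": "https://..."}
#   }
# }
# arb, eng = load_json_newsletters('newsletters')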
def detect_alphabet(lstr):
    ad = AlphabetDetector()
    lalphabets = []
    for l in lstr:
        ab = ad.detect_alphabet(l)
        if "CYRILLIC" in ab:
            lalphabets.append("CYRILLIC")
        else:
            lalphabets.append(ab.pop() if len(ab) != 0 else 'UND')
    return lalphabets
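# Usage sketch for the list variant; inputs are illustrative.
print(detect_alphabet(["alpha", "бета", "42"]))  # ['LATIN', 'CYRILLIC', 'UND']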
def getDescription(self, photoInfo):
    from alphabet_detector import AlphabetDetector
    ad = AlphabetDetector()
    text = u''
    descriptionString = photoInfo['photo']['description']['_content']
    for line in descriptionString.splitlines():
        # Keep only the lines that contain no Cyrillic characters.
        if 'CYRILLIC' not in ad.detect_alphabet(line):
            text += "\n" + line
    return text
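# Hedged note: photoInfo appears to follow the Flickr photos.getInfo JSON shape
# ({'photo': {'description': {'_content': ...}}}), inferred from the lookup
# above; the call below assumes a hypothetical owning instance.
# text = downloader.getDescription(photo_info)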
import codecs
import random


def make_mt_data_from_master(bigrams=False):
    ad = AlphabetDetector()
    source = codecs.open("data/mt/source.txt", "w", encoding="utf-8")
    target = codecs.open("data/mt/target.txt", "w", encoding="utf-8")
    source_valid = codecs.open("data/mt/source_valid.txt", "w", encoding="utf-8")
    target_valid = codecs.open("data/mt/target_valid.txt", "w", encoding="utf-8")
    mapped_titles = json.load(
        codecs.open("data/master-generated-titles-filtered.json", "r", encoding="utf-8"))
    keys = list(mapped_titles.keys())
    random.shuffle(keys)
    i = 0
    for key in keys:
        orig = clean_text(key.replace(".", ""))
        humorous_ones = mapped_titles[key]
        if "output" not in humorous_ones:
            continue
        for humorous in humorous_ones["output"]:
            i += 1
            if "ARABIC" in ad.detect_alphabet(humorous):
                # Skip the ones with Arabic characters.
                continue
            humorous = clean_text(humorous.replace(" .", ""))
            if not bigrams:
                target.write(humorous + "\n")
                source.write(orig + "\n")
                if i % 4 == 0:
                    target_valid.write(humorous + "\n")
                    source_valid.write(orig + "\n")
            if bigrams:
                source_grams, target_grams = __make_bigram_lists(orig, humorous)
                for x in range(len(source_grams)):
                    source_gram = " ".join(source_grams[x])
                    target_gram = " ".join(target_grams[x])
                    source.write(source_gram + "\n")
                    target.write(target_gram + "\n")
                    if i % 10 == 0:
                        source_valid.write(source_gram + "\n")
                        target_valid.write(target_gram + "\n")
    source.close()
    target.close()
    source_valid.close()
    target_valid.close()
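# Hedged input sketch: the JSON is expected to map an original title to an
# object with an "output" list of generated titles (shape inferred from the
# loop above); clean_text() and __make_bigram_lists() are project helpers not
# shown here.
# make_mt_data_from_master(bigrams=False)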
from typing import List, Sequence

import inflection


def get_words_cases(self, words: Sequence[str]) -> List[str]:
    pluralized_words = []
    alphabet_detector = AlphabetDetector()
    if isinstance(words, str):
        words = [words]
    for word in words:
        alphabets = alphabet_detector.detect_alphabet(word)
        if 'LATIN' in alphabets:
            pluralized_words.append(inflection.pluralize(word))
        elif 'CYRILLIC' in alphabets:
            pluralized_words += self.get_cyrillic_word_cases(word)
        else:
            self.logger.warning(
                'Unsupported language for text: {}'.format(word))
    return pluralized_words
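# A quick sketch of the Latin branch above; inflection.pluralize only handles
# English-style plurals, which is why Cyrillic words take a separate path.
print(inflection.pluralize("entity"))  # 'entities'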
def return_split_text_by_characterencode(orig_sentence):
    ad = AlphabetDetector()
    character_coding_list = ad.detect_alphabet(orig_sentence)
    final_text = ''
    for character_coding in character_coding_list:
        print(character_coding)
        # CJK scripts wrap at a narrower width than alphabetic scripts; if the
        # sentence mixes scripts, the last one detected decides the width.
        if 'HIRAGANA' in character_coding or 'KATAKANA' in character_coding or 'CJK' in character_coding:
            text_list = cut_text(f'{orig_sentence}', 43)
        else:
            text_list = cut_text(f'{orig_sentence}', 88)
        final_text = ''
        for text in text_list:
            final_text += f'{text}\n'
    return final_text
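# cut_text() is not defined in this snippet; a plausible stand-in that slices
# text into fixed-width chunks (an assumption, not the original helper):
def cut_text(text, length):
    # Split `text` into consecutive chunks of at most `length` characters.
    return [text[i:i + length] for i in range(0, len(text), length)]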
def detect_language(s):
    try:
        replace_list = ['–', '•']
        for x in replace_list:
            if x in s:
                s = s.replace(x, '')
        s.encode('ascii')
    except UnicodeEncodeError:
        # Non-ASCII text: classify by script.
        ad = AlphabetDetector()
        lang = ad.detect_alphabet(s)
        if 'ARABIC' not in lang:
            return 'other'
        return 'fa'
    else:
        return 'en'
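# Usage sketch for detect_language; sample strings are illustrative.
print(detect_language("hello"))    # 'en'    (pure ASCII)
print(detect_language("سلام"))     # 'fa'    (Arabic script)
print(detect_language("привет"))   # 'other' (non-ASCII, non-Arabic)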
def make_mt_data(bigrams=False):
    ad = AlphabetDetector()
    source = codecs.open("data/mt/source.txt", "w", encoding="utf-8")
    target = codecs.open("data/mt/target.txt", "w", encoding="utf-8")
    source_valid = codecs.open("data/mt/source_valid.txt", "w", encoding="utf-8")
    target_valid = codecs.open("data/mt/target_valid.txt", "w", encoding="utf-8")
    mapped_titles = json.load(
        codecs.open("data/mapped_titles.json", "r", encoding="utf-8"))
    datas = list(mapped_titles.values())
    random.shuffle(datas)
    i = 0
    for data in datas:
        orig = data["tweet_clean"]
        humorous = data["matched_title"]
        if orig == humorous:
            continue
        if "ARABIC" in ad.detect_alphabet(orig):
            # Skip the ones with Arabic characters.
            continue
        i += 1
        if not bigrams:
            source.write(humorous + "\n")
            target.write(orig + "\n")
            if i % 4 == 0:
                source_valid.write(humorous + "\n")
                target_valid.write(orig + "\n")
        if bigrams:
            source_grams, target_grams = __make_bigram_lists(humorous, orig)
            for x in range(len(source_grams)):
                source_gram = " ".join(source_grams[x])
                target_gram = " ".join(target_grams[x])
                source.write(source_gram + "\n")
                target.write(target_gram + "\n")
                if i % 5 == 0:
                    source_valid.write(source_gram + "\n")
                    target_valid.write(target_gram + "\n")
    source.close()
    target.close()
    source_valid.close()
    target_valid.close()
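# Hedged input sketch: data/mapped_titles.json is expected to map ids to
# objects carrying "tweet_clean" and "matched_title" keys (the only fields the
# loop above reads); anything beyond that is an assumption.
# make_mt_data(bigrams=True)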
def prepare_title(title):
    """Replaces non-alphanums from a paper title, allowing foreign characters,
    and cleans up multiple spaces and trailing spaces.

    Args:
        title (str): the title of the paper

    Returns:
        (str): cleaned title
    """
    detector = AlphabetDetector()
    if title is None:
        return ""
    result = "".join([
        x if len(detector.detect_alphabet(x)) > 0 or x.isnumeric() else " "
        for x in title.lower()
    ])
    # Recursively collapse double spaces.
    while "  " in result:
        result = result.replace("  ", " ")
    # Remove trailing spaces (also guards against an empty result).
    return result.rstrip()
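# Usage sketch: letters of any script and digits survive, everything else
# collapses to single spaces.
print(prepare_title("Deep Learning: A Survey (2020)"))  # 'deep learning a survey 2020'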
def sameAlphabet(self, vLine):
    """Return True when the line uses at most one alphabet."""
    ad = AlphabetDetector()
    if isinstance(vLine, bytes):
        vLine = vLine.decode('utf-8')
    return len(ad.detect_alphabet(vLine)) <= 1
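# Usage sketch (the owning instance is hypothetical):
# checker.sameAlphabet("all latin text")  -> True  ({'LATIN'})
# checker.sameAlphabet("mixed текст")     -> False ({'LATIN', 'CYRILLIC'})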
import csv
from random import shuffle


def mesh_dataset(filename_in, size_in):
    end_list = []
    ad = AlphabetDetector()
    dict_lists = {}
    match_max_len = 0
    match_min_len = 0
    match_total_len = 0
    match_total_rows = 0
    nomatch_max_len = 0
    nomatch_min_len = 0
    nomatch_total_len = 0
    nomatch_total_rows = 0
    l_index = 0
    with open("dataset_match_" + filename_in + ".csv", encoding="utf8") as csvfile1:
        reader = csv.DictReader(csvfile1, fieldnames=["ID1", "NAME1", "NAME2"], delimiter='|')
        for row in reader:
            l_index = l_index + 1
            # Collect the scripts used by both names, folding the Japanese and
            # Korean syllabaries into CJK.
            alpha = []
            alpha_1 = ad.detect_alphabet(row["NAME1"].strip())
            alpha_2 = ad.detect_alphabet(row["NAME2"].strip())
            for item in alpha_1:
                if 'KATAKANA' in item or 'HANGUL' in item or 'HIRAGANA' in item:
                    aux_item = 'CJK'
                else:
                    aux_item = item
                if aux_item not in alpha:
                    alpha.append(aux_item)
            for item in alpha_2:
                if 'KATAKANA' in item or 'HANGUL' in item or 'HIRAGANA' in item:
                    aux_item = 'CJK'
                else:
                    aux_item = item
                if aux_item not in alpha:
                    alpha.append(aux_item)
            len_1 = len(row["NAME1"])
            len_2 = len(row["NAME2"])
            if len_1 > match_max_len or match_max_len == 0:
                match_max_len = len_1
            if len_2 > match_max_len:
                match_max_len = len_2
            if len_1 < match_min_len or match_min_len == 0:
                match_min_len = len_1
            if len_2 < match_min_len:
                match_min_len = len_2
            match_total_rows += 2
            match_total_len += len_1 + len_2
            alpha_line = ""
            for item in alpha:
                alpha_line = alpha_line + item + ";"
            alpha_line = alpha_line.strip(";")
            if alpha_line.count(";") > 0 and 'LATIN' in alpha_line:
                alpha_line_aux = "MIXED WITH LATIN"
            elif alpha_line.count(";") > 0:
                alpha_line_aux = "MIXED WITHOUT LATIN"
            else:
                alpha_line_aux = alpha_line
            if alpha_line_aux not in dict_lists:
                dict_lists[alpha_line_aux] = 1
            else:
                dict_lists[alpha_line_aux] += 1
            line = row["NAME1"].strip() + "|" + row["NAME2"].strip() + "|1|" + alpha_line
            end_list.append(line)
            # Integer division so the modulo check works on whole row counts.
            if l_index % (size_in // 2) == 0:
                break
    with open("dataset_non_match_" + filename_in + ".csv", encoding="utf8") as csvfile2:
        reader = csv.DictReader(csvfile2, fieldnames=["ID1", "NAME1", "ID2", "NAME2"], delimiter='|')
        for row in reader:
            l_index = l_index + 1
            alpha = []
            alpha_1 = ad.detect_alphabet(row["NAME1"].strip())
            alpha_2 = ad.detect_alphabet(row["NAME2"].strip())
            for item in alpha_1:
                if 'KATAKANA' in item or 'HANGUL' in item or 'HIRAGANA' in item:
                    aux_item = 'CJK'
                else:
                    aux_item = item
                if aux_item not in alpha:
                    alpha.append(aux_item)
            for item in alpha_2:
                if 'KATAKANA' in item or 'HANGUL' in item or 'HIRAGANA' in item:
                    aux_item = 'CJK'
                else:
                    aux_item = item
                if aux_item not in alpha:
                    alpha.append(aux_item)
            len_1 = len(row["NAME1"])
            len_2 = len(row["NAME2"])
            if len_1 > nomatch_max_len or nomatch_max_len == 0:
                nomatch_max_len = len_1
            if len_2 > nomatch_max_len:
                nomatch_max_len = len_2
            if len_1 < nomatch_min_len or nomatch_min_len == 0:
                nomatch_min_len = len_1
            if len_2 < nomatch_min_len:
                nomatch_min_len = len_2
            nomatch_total_rows += 2  # count both names, mirroring the matching branch
            nomatch_total_len += len_1 + len_2
            alpha_line = ""
            for item in alpha:
                alpha_line = alpha_line + item + ";"
            alpha_line = alpha_line.strip(";")
            if alpha_line.count(";") > 0 and 'LATIN' in alpha_line:
                alpha_line_aux = "MIXED WITH LATIN"
            elif alpha_line.count(";") > 0:
                alpha_line_aux = "MIXED WITHOUT LATIN"
            else:
                alpha_line_aux = alpha_line
            if alpha_line_aux not in dict_lists:
                dict_lists[alpha_line_aux] = 1
            else:
                dict_lists[alpha_line_aux] += 1
            line = row["NAME1"].strip() + "|" + row["NAME2"].strip() + "|0|" + alpha_line
            end_list.append(line)
            if l_index % size_in == 0:
                break
    shuffle(end_list)
    with open("dataset_final_" + filename_in + ".csv", 'w', encoding="utf-8") as inW1:
        for row in end_list:
            inW1.write(row.strip() + '\n')
    with open("Report_DataSet_" + filename_in + ".txt", 'w', encoding="utf-8") as inW2:
        inW2.write(filename_in.upper() + '\n')
        inW2.write('\n')
        inW2.write("Total Pairs: " + str((match_total_rows + nomatch_total_rows) // 2) + '\n')
        inW2.write("Matching Pairs: " + str(match_total_rows // 2) + '\n')
        inW2.write("Non Matching Pairs: " + str(nomatch_total_rows // 2) + '\n')
        for key, value in sorted(dict_lists.items(), key=lambda x: x[1], reverse=True):
            inW2.write(key + ": " + str(value) + "\n")
        inW2.write("\n")
        inW2.write("Matching INFO:" + "\n")
        inW2.write("Max Length Matching:" + str(match_max_len) + "\n")
        inW2.write("Min Length Matching:" + str(match_min_len) + "\n")
        inW2.write("Avg Matching:" + str(round(match_total_len / match_total_rows, 3)) + "\n")
        inW2.write("Total Length Matching:" + str(match_total_len) + "\n")
        inW2.write("Total Rows Matching:" + str(match_total_rows) + "\n")
        inW2.write("\n")
        inW2.write("Non Matching INFO:" + "\n")
        inW2.write("Max Length Non Matching:" + str(nomatch_max_len) + "\n")
        inW2.write("Min Length Non Matching:" + str(nomatch_min_len) + "\n")
        inW2.write("Avg Non Matching:" + str(round(nomatch_total_len / nomatch_total_rows, 3)) + "\n")
        inW2.write("\n")
        inW2.write("Dataset INFO:" + "\n")
        inW2.write("Max Length Dataset:" + str(max(match_max_len, nomatch_max_len)) + "\n")
        inW2.write("Min Length Dataset:" + str(min(match_min_len, nomatch_min_len)) + "\n")
        inW2.write("Avg Dataset:" + str(
            round((match_total_len + nomatch_total_len) /
                  (match_total_rows + nomatch_total_rows), 3)) + "\n")
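# Hedged usage sketch: mesh_dataset expects pipe-delimited
# dataset_match_<name>.csv (ID1|NAME1|NAME2) and dataset_non_match_<name>.csv
# (ID1|NAME1|ID2|NAME2) in the working directory; the name and size below are
# assumptions.
# mesh_dataset("orgs", 10000)  # writes dataset_final_orgs.csv and Report_DataSet_orgs.txt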