from alphabet_detector import AlphabetDetector

def detect_alphabet(text):
    """Return the dominant alphabet of `text`, or 'UND' if none is detected."""
    ad = AlphabetDetector()
    if isinstance(text, bytes):
        text = text.decode("utf-8")
    ab = ad.detect_alphabet(text)
    if "CYRILLIC" in ab:
        return "CYRILLIC"
    return ab.pop() if len(ab) != 0 else 'UND'
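# A minimal usage sketch for detect_alphabet above (the sample strings are
# illustrative, not from the original source):
#
#   detect_alphabet('hello')   # -> 'LATIN'
#   detect_alphabet('привет')  # -> 'CYRILLIC'
#   detect_alphabet('123')     # -> 'UND' (no alphabetic characters)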
def extractor():
    alpha_det = AlphabetDetector()
    url = request.args.get('url')
    if not url:
        return render_template('no_url.html')
    url = url.strip()
    # title = extract_util.get_title(url)
    title, newspaper3k_text = util.extract_newspaper3k(url)

    if 'ARABIC' in alpha_det.detect_alphabet(title):
        text_dir = 'rtl'
        lang = 'Arabic'
    else:
        text_dir = 'ltr'
        lang = 'English'

    date = util.get_date(url)
    text_justext = util.get_text_justext(url, lang)
    news_please_text = util.extract_news_please(url)
    # _, bs4_text = extract_util.get_title_text_BS4(url)
    text_pextract = util.get_pextract(url)

    texts = OrderedDict()
    texts['Justext'] = text_justext
    texts['Newspaper3k'] = newspaper3k_text
    texts['NewsPlease'] = news_please_text
    texts['pextract'] = text_pextract

    return render_template('article_info.html',
                           url=url,
                           title=title,
                           date=date,
                           text_dir=text_dir,
                           texts=texts)
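# Hypothetical wiring for the extractor view above; the Flask app object, the
# route, and the `util` helper module are assumptions, not shown in the source:
#
#   from collections import OrderedDict
#   from flask import Flask, request, render_template
#
#   app = Flask(__name__)
#   app.add_url_rule('/extract', 'extractor', extractor)
#   # e.g. GET /extract?url=https://example.com/article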
Example #3
def do_split_marker_by_script(sfm, find_marker, script1, script2, new_marker1, new_marker2):
    """Find a given marker and split its data by script."""
    ad = AlphabetDetector()
    new_sfm = sfm
    count = 0
    logging.info("\nIn do_split_marker_by_script code:\n")

    for i, entry in enumerate(new_sfm):
        for j, field in enumerate(entry):
            marker, data = field
            if marker == find_marker:
                count = count + 1
                scripts = ad.detect_alphabet(data)
                script_count = len(scripts)

                if script_count == 1:
                    script = next(iter(scripts))
                    new_field = [marker + '_' + script, data]
                    new_sfm[i].insert(j + 1, new_field)
                    #logging.info("\nFound '{}' only containing {}. Adding new field: {}".format(data, script, new_field))
                    #logging.info(new_sfm[i])

                elif script_count > 1:
                    print("\nFound {} scripts: {}".format(len(scripts), scripts))
                    print("Data is {}".format(data))
                    for script_number, script in enumerate(scripts):
                        # Keep spaces plus the characters belonging to this script.
                        string_list = [character for character in data
                                       if character == ' ' or script in ad.detect_alphabet(character)]
                        string = ''.join(string_list).strip()
                        new_field = [marker + '_' + script, string]
                        print("New_field is {}".format(new_field))
                        new_sfm[i].insert(j + script_number + 1, new_field)
                    print(new_sfm[i])
    return new_sfm, count
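# A minimal usage sketch for do_split_marker_by_script (the SFM data below is
# illustrative: a list of entries, each a list of [marker, data] fields; the
# script/new-marker parameters are accepted but unused in the body above):
#
#   sfm = [[['lx', 'hello привет']]]
#   new_sfm, count = do_split_marker_by_script(
#       sfm, 'lx', 'LATIN', 'CYRILLIC', 'lx_lat', 'lx_cyr')
#   # adds ['lx_LATIN', 'hello'] and ['lx_CYRILLIC', 'привет'] after the 'lx' field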
Example #4
def load_json_newsletters(corpus_dir):
    alphabet_detector = AlphabetDetector()
    arb_corpus = list()
    eng_corpus = list()
    json_files = glob.glob(corpus_dir + '/*.json')
    print('# of newsletters:', len(json_files))
    for json_file in json_files:
        with open(json_file, encoding='utf-8') as f:
            json_doc = json.load(f)
        try:
            j_articles = json_doc['articles']
            # print('# of articles:', len(j_articles))
            for e in j_articles:
                doc_id = j_articles[e]['id']
                title = clean_text(j_articles[e]['title'])
                text = clean_text(j_articles[e]['body'])
                # print(text)
                link = j_articles[e]['link']
                if text and 'ARABIC' in alphabet_detector.detect_alphabet(text):
                    arb_corpus.append(text)
                elif text:  # skip empty bodies instead of counting them as English
                    eng_corpus.append(text)
        except KeyError:
            continue

    print('# of Arabic documents:', len(arb_corpus))
    print('# of English documents:', len(eng_corpus))
    return arb_corpus, eng_corpus
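# Usage sketch ('newsletters' is an illustrative directory name); each JSON
# file is expected to look like
# {"articles": {"<key>": {"id": ..., "title": ..., "body": ..., "link": ...}}}:
#
#   arb_corpus, eng_corpus = load_json_newsletters('newsletters')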
Example #5
def detect_alphabet(lstr):
    ad = AlphabetDetector()
    lalphabets = []
    for s in lstr:
        ab = ad.detect_alphabet(s)
        if "CYRILLIC" in ab:
            lalphabets.append("CYRILLIC")
        else:
            lalphabets.append(ab.pop() if len(ab) != 0 else 'UND')
    return lalphabets
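# Usage sketch for the list variant above:
#
#   detect_alphabet(['hello', 'привет', '123'])  # -> ['LATIN', 'CYRILLIC', 'UND']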
Example #6
    def getDescription(self, photoInfo):

        from alphabet_detector import AlphabetDetector
        ad = AlphabetDetector()

        text = u''

        descriptionString = photoInfo['photo']['description']['_content']
        for line in descriptionString.splitlines():
            if 'CYRILLIC' not in ad.detect_alphabet(line):
                text += "\n" + line

        return text
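# Usage sketch: photoInfo follows the Flickr-style shape the code assumes,
# {'photo': {'description': {'_content': ...}}}; lines containing Cyrillic
# text are dropped from the result:
#
#   info = {'photo': {'description': {'_content': 'first line\nвторая строка'}}}
#   self.getDescription(info)  # -> '\nfirst line'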
Example #7
def make_mt_data_from_master(bigrams=False):
    ad = AlphabetDetector()
    source = codecs.open("data/mt/source.txt", "w", encoding="utf-8")
    target = codecs.open("data/mt/target.txt", "w", encoding="utf-8")
    source_valid = codecs.open("data/mt/source_valid.txt",
                               "w",
                               encoding="utf-8")
    target_valid = codecs.open("data/mt/target_valid.txt",
                               "w",
                               encoding="utf-8")
    mapped_titles = json.load(
        codecs.open("data/master-generated-titles-filtered.json",
                    "r",
                    encoding="utf-8"))
    keys = list(mapped_titles.keys())
    random.shuffle(keys)
    i = 0
    for key in keys:
        orig = clean_text(key.replace(".", ""))
        humorous_ones = mapped_titles[key]
        if "output" not in humorous_ones:
            continue
        for humorous in humorous_ones["output"]:
            i += 1
            if "ARABIC" in ad.detect_alphabet(humorous):
                #skip the ones with arabic characters
                continue
            humorous = clean_text(humorous.replace(" .", ""))
            if not bigrams:
                target.write(humorous + "\n")
                source.write(orig + "\n")
                if i % 4 == 0:
                    target_valid.write(humorous + "\n")
                    source_valid.write(orig + "\n")
            if bigrams:
                source_grams, target_grams = __make_bigram_lists(
                    orig, humorous)
                for x in range(len(source_grams)):
                    source_gram = " ".join(source_grams[x])
                    target_gram = " ".join(target_grams[x])
                    source.write(source_gram + "\n")
                    target.write(target_gram + "\n")
                    if i % 10 == 0:
                        source_valid.write(source_gram + "\n")
                        target_valid.write(target_gram + "\n")
    source.close()
    target.close()
    source_valid.close()
    target_valid.close()
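# Usage sketch: make_mt_data_from_master() expects
# data/master-generated-titles-filtered.json to map an original title to a
# dict with an "output" list of humorous variants; it writes original titles
# to the source file and humorous variants to the target file under data/mt/,
# copying every 4th pair (or every 10th bigram pair) into the validation files.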
Example #8
    def get_words_cases(self, words: Sequence[str]) -> List[str]:
        pluralized_words = []
        alphabet_detector = AlphabetDetector()
        if isinstance(words, str):
            words = [words]
        for word in words:
            alphabets = alphabet_detector.detect_alphabet(word)
            if 'LATIN' in alphabets:
                pluralized_words.append(inflection.pluralize(word))
            elif 'CYRILLIC' in alphabets:
                pluralized_words += self.get_cyrillic_word_cases(word)
            else:
                self.logger.warning(
                    'Unsupported language for text: {}'.format(word))
        return pluralized_words
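# Usage sketch (inflection is the PyPI "inflection" package;
# get_cyrillic_word_cases is another method on the same class):
#
#   self.get_words_cases(['cat'])  # -> ['cats']
#   self.get_words_cases('книга')  # -> whatever get_cyrillic_word_cases returns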
Example #9
def return_split_text_by_characterencode(orig_sentence):
    ad = AlphabetDetector()
    character_codings = ad.detect_alphabet(orig_sentence)
    # CJK glyphs render roughly double-width, so wrap them at a shorter line length.
    if any('HIRAGANA' in coding or 'KATAKANA' in coding or 'CJK' in coding
           for coding in character_codings):
        text_list = cut_text(orig_sentence, 43)
    else:
        text_list = cut_text(orig_sentence, 88)
    final_text = ''
    for text in text_list:
        final_text += f'{text}\n'
    return final_text
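# cut_text is not shown in this example; a minimal stand-in that slices the
# string into chunks of at most `width` characters could look like:
#
#   def cut_text(text, width):
#       return [text[i:i + width] for i in range(0, len(text), width)]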
Example #10
def detect_language(s):
    try:
        replace_list = ['–', '•']
        for x in replace_list:
            if x in s:
                s = s.replace(x, '')
        s.encode('ascii')

    except UnicodeEncodeError:
        ad = AlphabetDetector()
        lang = ad.detect_alphabet(s)
        # Arabic script is assumed to be Persian here.
        if 'ARABIC' not in lang:
            return 'other'
        return 'fa'
    else:
        return 'en'
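# Usage sketch: ASCII-only strings short-circuit to 'en'; anything else is
# classified by alphabet:
#
#   detect_language('hello')   # -> 'en'
#   detect_language('سلام')    # -> 'fa'
#   detect_language('привет')  # -> 'other'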
Example #11
def make_mt_data(bigrams=False):
    ad = AlphabetDetector()
    source = codecs.open("data/mt/source.txt", "w", encoding="utf-8")
    target = codecs.open("data/mt/target.txt", "w", encoding="utf-8")
    source_valid = codecs.open("data/mt/source_valid.txt",
                               "w",
                               encoding="utf-8")
    target_valid = codecs.open("data/mt/target_valid.txt",
                               "w",
                               encoding="utf-8")
    mapped_titles = json.load(
        codecs.open("data/mapped_titles.json", "r", encoding="utf-8"))
    datas = list(mapped_titles.values())
    random.shuffle(datas)
    i = 0
    for data in datas:
        orig = data["tweet_clean"]
        humorous = data["matched_title"]
        if orig == humorous:
            continue
        if "ARABIC" in ad.detect_alphabet(orig):
            #skip the ones with arabic characters
            continue
        i += 1
        if not bigrams:
            source.write(humorous + "\n")
            target.write(orig + "\n")
            if i % 4 == 0:
                source_valid.write(humorous + "\n")
                target_valid.write(orig + "\n")
        if bigrams:
            source_grams, target_grams = __make_bigram_lists(humorous, orig)
            for x in range(len(source_grams)):
                source_gram = " ".join(source_grams[x])
                target_gram = " ".join(target_grams[x])
                source.write(source_gram + "\n")
                target.write(target_gram + "\n")
                if i % 5 == 0:
                    source_valid.write(source_gram + "\n")
                    target_valid.write(target_gram + "\n")
    source.close()
    target.close()
    source_valid.close()
    target_valid.close()
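# Usage sketch: like make_mt_data_from_master above, make_mt_data() writes
# parallel files under data/mt/, but it reads data/mapped_titles.json, whose
# values carry "tweet_clean" and "matched_title" fields, and maps humorous
# titles (source) to original tweets (target).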
Example #12
def prepare_title(title):
    """Replaces non-alphanumeric characters in a paper title with spaces,
    keeping foreign characters, and cleans up repeated and trailing spaces.

    Args:
        title (str): the title of the paper

    Returns:
        (str): cleaned title
    """
    detector = AlphabetDetector()
    if title is None:
        return ""
    result = "".join([
        x if len(detector.detect_alphabet(x)) > 0 or x.isnumeric() else " "
        for x in title.lower()
    ])
    # Collapse runs of spaces
    while "  " in result:
        result = result.replace("  ", " ")
    # Remove a trailing space (endswith is safe on an empty result)
    if result.endswith(" "):
        result = result[:-1]
    return result
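# Usage sketch:
#
#   prepare_title("Attention Is All You Need!")  # -> 'attention is all you need'
#   prepare_title(None)                          # -> ''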
Example #13
    def sameAlphabet(self, vLine):
        # vLine is expected to be UTF-8 encoded bytes.
        ad = AlphabetDetector()
        return len(ad.detect_alphabet(vLine.decode('utf-8'))) <= 1
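# Usage sketch: mixed-script input returns False:
#
#   self.sameAlphabet('hello world'.encode('utf-8'))   # -> True
#   self.sameAlphabet('hello привет'.encode('utf-8'))  # -> False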
Example #14
def mesh_dataset(filename_in, size_in):
    end_list = []
    ad = AlphabetDetector()
    dict_lists = {}
    match_max_len = 0
    match_min_len = 0
    match_total_len = 0
    match_total_rows = 0
    nomatch_max_len = 0
    nomatch_min_len = 0
    nomatch_total_len = 0
    nomatch_total_rows = 0
    l_index = 0

    with open("dataset_match_" + filename_in + ".csv",
              encoding="utf8") as csvfile1:
        reader = csv.DictReader(csvfile1,
                                fieldnames=["ID1", "NAME1", "NAME2"],
                                delimiter='|')
        for row in reader:
            l_index = l_index + 1
            alpha = []
            alpha_1 = ad.detect_alphabet(row["NAME1"].strip())
            alpha_2 = ad.detect_alphabet(row["NAME2"].strip())
            # Fold the Japanese and Korean scripts into a single CJK bucket.
            for item in list(alpha_1) + list(alpha_2):
                if 'KATAKANA' in item or 'HANGUL' in item or 'HIRAGANA' in item:
                    aux_item = 'CJK'
                else:
                    aux_item = item
                if aux_item not in alpha:
                    alpha.append(aux_item)

            len_1 = len(row["NAME1"])
            len_2 = len(row["NAME2"])
            if len_1 > match_max_len or match_max_len == 0:
                match_max_len = len_1
            if len_2 > match_max_len:
                match_max_len = len_2
            if len_1 < match_min_len or match_min_len == 0:
                match_min_len = len_1
            if len_2 < match_min_len:
                match_min_len = len_2
            match_total_rows += 2
            match_total_len += len_1 + len_2

            alpha_line = ";".join(alpha)
            if alpha_line.count(";") > 0 and 'LATIN' in alpha_line:
                alpha_line_aux = "MIXED WITH LATIN"
            elif alpha_line.count(";") > 0:
                alpha_line_aux = "MIXED WITHOUT LATIN"
            else:
                alpha_line_aux = alpha_line

            dict_lists[alpha_line_aux] = dict_lists.get(alpha_line_aux, 0) + 1
            line = (row["NAME1"].strip() + "|" + row["NAME2"].strip()
                    + "|1|" + alpha_line)
            end_list.append(line)
            if l_index % (size_in / 2) == 0:
                break

    with open("dataset_non_match_" + filename_in + ".csv",
              encoding="utf8") as csvfile2:
        reader = csv.DictReader(csvfile2,
                                fieldnames=["ID1", "NAME1", "ID2", "NAME2"],
                                delimiter='|')
        for row in reader:
            l_index = l_index + 1
            alpha = []
            alpha_1 = ad.detect_alphabet(row["NAME1"].strip())
            alpha_2 = ad.detect_alphabet(row["NAME2"].strip())
            # Fold the Japanese and Korean scripts into a single CJK bucket.
            for item in list(alpha_1) + list(alpha_2):
                if 'KATAKANA' in item or 'HANGUL' in item or 'HIRAGANA' in item:
                    aux_item = 'CJK'
                else:
                    aux_item = item
                if aux_item not in alpha:
                    alpha.append(aux_item)

            len_1 = len(row["NAME1"])
            len_2 = len(row["NAME2"])
            if len_1 > nomatch_max_len or nomatch_max_len == 0:
                nomatch_max_len = len_1
            if len_2 > nomatch_max_len:
                nomatch_max_len = len_2
            if len_1 < nomatch_min_len or nomatch_min_len == 0:
                nomatch_min_len = len_1
            if len_2 < nomatch_min_len:
                nomatch_min_len = len_2
            nomatch_total_rows += 1
            nomatch_total_len = nomatch_total_len + len_1 + len_2

            alpha_line = ";".join(alpha)
            if alpha_line.count(";") > 0 and 'LATIN' in alpha_line:
                alpha_line_aux = "MIXED WITH LATIN"
            elif alpha_line.count(";") > 0:
                alpha_line_aux = "MIXED WITHOUT LATIN"
            else:
                alpha_line_aux = alpha_line

            dict_lists[alpha_line_aux] = dict_lists.get(alpha_line_aux, 0) + 1
            line = (row["NAME1"].strip() + "|" + row["NAME2"].strip()
                    + "|0|" + alpha_line)
            end_list.append(line)
            if l_index % size_in == 0:
                break

    shuffle(end_list)

    with open("dataset_final_" + filename_in + ".csv", 'w',
              encoding="utf-8") as inW1:
        for row in end_list:
            inW1.write(row.strip() + '\n')

    with open("Report_DataSet_" + filename_in + ".txt", 'w',
              encoding="utf-8") as inW2:
        inW2.write(filename_in.upper() + '\n')
        inW2.write('\n')
        inW2.write("Total Pairs: " +
                   str((match_total_rows + nomatch_total_rows) / 2) + '\n')
        inW2.write("Matching Pairs: " + str(match_total_rows / 2) + '\n')
        inW2.write("Non Matching Pairs: " + str(nomatch_total_rows / 2) + '\n')
        for key, value in sorted(dict_lists.items(),
                                 key=lambda x: x[1],
                                 reverse=True):
            inW2.write(key + ": " + str(value) + "\n")

        inW2.write("\n")
        inW2.write("Matching INFO:" + "\n")
        inW2.write("Max Length Matching:" + str(match_max_len) + "\n")
        inW2.write("Min Length Matching:" + str(match_min_len) + "\n")
        inW2.write("Avg Matching:" +
                   str(round(match_total_len / match_total_rows, 3)) + "\n")
        inW2.write("Total Max LEN:" + str(match_total_len) + "\n")
        inW2.write("Total Max ROWS:" + str(match_total_rows) + "\n")
        inW2.write("\n")
        inW2.write("Non Matching INFO:" + "\n")
        inW2.write("Max Length Non Matching:" + str(nomatch_max_len) + "\n")
        inW2.write("Min Length Non Matching:" + str(nomatch_min_len) + "\n")
        inW2.write("Avg Non Matching:" +
                   str(round(nomatch_total_len / nomatch_total_rows, 3)) +
                   "\n")
        inW2.write("\n")
        inW2.write("Dataset INFO:" + "\n")
        inW2.write("Max Length Dataset:" +
                   str(max(match_max_len, nomatch_max_len)) + "\n")
        inW2.write("Min Length Dataset:" +
                   str(min(match_min_len, nomatch_min_len)) + "\n")
        inW2.write("Avg Dataset:" + str(
            round((match_total_len + nomatch_total_len) /
                  (match_total_rows + nomatch_total_rows), 3)) + "\n")
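# Usage sketch ('orgs' and the size are illustrative): mesh_dataset('orgs', 10000)
# reads dataset_match_orgs.csv (ID1|NAME1|NAME2) and dataset_non_match_orgs.csv
# (ID1|NAME1|ID2|NAME2), labels each pair with its script mix, and writes a
# shuffled dataset_final_orgs.csv plus a Report_DataSet_orgs.txt summary.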