Example #1
def find_pinyin(word):
    """Return the concatenated tone digits for *word*, or None if it contains non-Chinese characters."""
    tones = None
    characters = list(word)

    # Scan for any character that is not Chinese (Latin letters, digits, punctuation, ...).
    i = 0
    has_english = False
    while not has_english and i < len(characters):
        has_english = not hanzidentifier.has_chinese(characters[i])
        i += 1

    if not has_english:
        # e.g. "你好" -> "ni3-hao3"; keep the trailing tone digit of each syllable.
        ret = p.get_pinyin(word, tone_marks="numbers")
        words = ret.split("-")
        tones = [syllable[-1] for syllable in words]

        tones = tone_change(tones)
        tones = "".join(tones)

    return tones
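For context, this excerpt leans on names defined elsewhere in its module: `p` looks like an xpinyin.Pinyin instance (its get_pinyin(..., tone_marks="numbers") call matches that API) and `tone_change` is a tone-adjustment helper that is not shown. A minimal, hedged sketch of that assumed setup:

# Hedged reconstruction of the surrounding module; `tone_change` here is a
# hypothetical pass-through standing in for the original helper.
import hanzidentifier
from xpinyin import Pinyin

p = Pinyin()

def tone_change(tones):
    return tones  # the real helper presumably applies tone-sandhi rules

print(find_pinyin("你好"))    # with the pass-through above this prints "33"
print(find_pinyin("hi你好"))  # None: the word contains non-Chinese characters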
Example #2
def is_entirely_chinese(s):
    """
    Check if every character in the string is Chinese.

    *s* string to search in
    """
    debug('TEST(IEC): '+str(s))
    for c in s:
        if not ziid.has_chinese(c):
            debug('TEST(IEC): '+c+' is not chinese.')
            return False
    return True
Example #3
def is_entirely_chinese(s):
    """
    Check if every character in the string is Chinese.

    *s* string to search in
    """
    debug('TEST(IEC): '+str(s))
    for c in s:
        if not ziid.has_chinese(c):
            debug('TEST(IEC): '+c+' is not chinese.')
            return False
    return True
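`ziid` appears to be the hanzidentifier module imported under an alias and `debug` a logging helper defined elsewhere; a small, hedged sketch exercising the function under those assumptions:

import hanzidentifier as ziid  # assumed alias, not shown in the excerpt

def debug(msg):
    print(msg)  # hypothetical stand-in for the original logging helper

print(is_entirely_chinese("漢字"))     # True: every character is Chinese
print(is_entirely_chinese("漢字abc"))  # False: 'a' fails the has_chinese check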
Example #4
 def is_chinese(self,text):
    ret = []
    if hanzidentifier.has_chinese(text): #chinese
      chineseArray = re.findall(r'[\u4e00-\u9fff]+', text)
      chineseChars = sum(len(chunk) for chunk in chineseArray)  # count only the Chinese characters
      if len(text) / 3 < chineseChars: #At least 1/3 of the sentence in Chinese characters
        identity = hanzidentifier.identify(text)
        if identity == hanzidentifier.SIMPLIFIED:
          ret = [[1,'ZH-CHS']]
        elif identity == hanzidentifier.TRADITIONAL:
          ret = [[1,'ZH-CHT']]
        elif identity in (hanzidentifier.BOTH, hanzidentifier.MIXED):
          ret = [[1,'ZH-CHT'],[1,'ZH-CHS']]
    return ret
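Example #4 branches on the constants returned by hanzidentifier.identify() (SIMPLIFIED, TRADITIONAL, BOTH, MIXED); a quick, hedged check of that comparison in isolation:

import hanzidentifier

# identify() returns one of the module-level constants compared against above.
print(hanzidentifier.identify("我们") == hanzidentifier.SIMPLIFIED)   # True: "们" appears only in simplified text
print(hanzidentifier.identify("我們") == hanzidentifier.TRADITIONAL)  # True: "們" appears only in traditional text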
Example #5
def get_indices(keyword, dataframe):
    """
    Retrieves document indexes from the dataframe based on the keyword. Returns a list of integers that correspond to the tweet or Weibo post
    index in the corresponding corpus.
    
    Arguments:
    keyword -- (str) a string that is either English or Chinese.
    dataframe -- (pd.DataFrame) a corpus of social media posts from Twitter or Weibo.
    """
    indices = []
    if hanzidentifier.has_chinese(keyword):
        indices.extend(
            dataframe[dataframe['text'].str.contains(keyword)].index)
    else:
        word_regex = r"[a-z]+"
        match = re.match(word_regex, keyword)
        if match:
            indices.extend(dataframe[dataframe['joined_lems'].str.contains(
                pat=fr'\b{keyword}\b', regex=True, case=False)].index)
    return indices
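A hedged usage sketch of get_indices against a tiny invented corpus; the 'text' and 'joined_lems' column names mirror what the function body expects:

import re
import hanzidentifier
import pandas as pd

toy_df = pd.DataFrame({
    "text": ["我爱北京", "I love Beijing", "hello world"],
    "joined_lems": ["", "i love beijing", "hello world"],
})

print(get_indices("北京", toy_df))     # [0]: Chinese keyword, substring match on 'text'
print(get_indices("beijing", toy_df))  # [1]: English keyword, word-boundary match on 'joined_lems'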
Example #6
def create_wc(word, indices_list, filename, dataframe):
    """
    Generate and return a word cloud based on the keyword.

    Arguments:
    word -- (str) a string that is either English or Chinese.
    indices_list -- (list) list of integers corresponding to the index of a dataframe.
    filename -- (str) path where the generated word cloud image is saved.
    dataframe -- (pd.DataFrame) a corpus of social media posts from Twitter or Weibo.
    """
    all_posts = []
    response_weibo = requests.get(weibo_mask_url)
    response_twitter = requests.get(twitter_mask_url)
    background_cn = np.array(Image.open(BytesIO(response_weibo.content)))
    background_en = np.array(Image.open(BytesIO(response_twitter.content)))

    if hanzidentifier.has_chinese(word):
        font_path = 'SourceHanSansSC-Regular.otf'  #need to download the font file before loading
        background = background_cn
        stopwords = chinese_stopwords
    else:
        font_path = None
        background = background_en
        stopwords = english_stopwords

    for post_index in indices_list:
        all_posts.append(dataframe.loc[dataframe.index == post_index,
                                       "text"].iloc[0])

    wordcloud = WordCloud(background_color='white',
                          max_words=100,
                          width=200,
                          height=100,
                          font_path=font_path,  # use the Chinese font only when needed
                          mask=background,
                          stopwords=stopwords).generate(" ".join(all_posts))

    plt.figure(figsize=(10, 5))  # figsize is in inches, not pixels
    plt.imshow(wordcloud)
    plt.axis("off")
    wordcloud.to_file(filename)
Example #7
final = [[[], []] for y in range(len(dat))]
i = 0
while i < len(dat):
    print(i)
    mapping = {}  # avoid shadowing the dict builtin
    a = eval(dat[i][0])
    b = eval(dat[i][1])
    for k in b:
        if b[k] in a:
            mapping[k] = a[b[k]]
    #print(mapping)
    for k in mapping:
        if hanzidentifier.has_chinese(k) and len(k) > 1:
            for j in k:
                if j != "" and j != " ":
                    final[i][0].append(j)
                    # BIO tagging: the first character keeps the B- tag,
                    # the remaining characters get the matching I- tag.
                    if mapping[k] == "B-entity" and j == k[0]:
                        final[i][1].append(mapping[k])
                        continue
                    if mapping[k] == "B-entity" and j != k[0]:
                        final[i][1].append("I-entity")
                        continue
                    if mapping[k] == "B-action" and j == k[0]:
                        final[i][1].append(mapping[k])
                        continue
                    if mapping[k] == "B-action" and j != k[0]:
                        final[i][1].append("I-action")
                        continue
    i += 1
Example #8
    def do_GET(self):
        self.send_response(200)
        query = parse.urlsplit(self.path).query
        query_dict = parse.parse_qs(query)
        if "keyword_search_frontend.css" in self.path:
            self.send_header('Content-type', 'text/css; charset=utf-8')
            self.end_headers()
            f = open("keyword_search_frontend.css", encoding="utf-8")
            html = f.read()
            f.close()
            self.wfile.write(html.encode("utf-8"))
        elif "keyword_search_frontend.js" in self.path:
            self.send_header('Content-type', 'text/javascript; charset=utf-8')
            self.end_headers()
            f = open("keyword_search_frontend.js", encoding="utf-8")
            html = f.read()
            f.close()
            self.wfile.write(html.encode("utf-8"))
        elif self.path == "/":
            self.send_header('Content-type', 'text/html; charset=utf-8')
            self.end_headers()
            f = open("keyword_search_frontend.html", encoding="utf-8")
            html = f.read()
            f.close()
            self.wfile.write(html.encode("utf-8"))
        elif "png" in self.path:
            self.send_header('Content-type', 'image/png;')
            self.end_headers()
            wc = open("word_cloud.png", "rb")
            self.wfile.write(wc.read())
            wc.close()

        else:

            self.send_header('Content-type', 'text/html; charset=utf-8')
            self.end_headers()

            print(query_dict)
            if query_dict['type'][0] == 'category':
                language = query_dict["lang"][0]
                selected_category = query_dict["categories"][0]
                print(language, selected_category)
                results = dropdown_menu(selected_category, language)
                posts_table = put_in_table(results)
                self.wfile.write(b"<html>" + posts_table.encode("utf-8") +
                                 b"</html>")
            else:
                keyword = query_dict["keyword"][0]
                if hanzidentifier.has_chinese(keyword):  #checks for language
                    dataframe = weibo_df
                else:
                    keyword = lemmatize(keyword.lower())
                    dataframe = twitter_df

                if keyword in english_vocab or keyword in chinese_vocab:  #checks if keyword in corpus
                    doc_indices = get_indices(keyword, dataframe)
                    results = get_posts(keyword, doc_indices, dataframe)
                    create_wc(keyword, doc_indices, "word_cloud.png",
                              dataframe)
                    posts_table = put_in_table(results)
                    random_num = str(random.randint(1, 1000))
                    self.wfile.write(b"<html>" + posts_table.encode("utf-8") +
                                     b'<img src="word_cloud.png?' +
                                     random_num.encode("utf-8") + b'">' +
                                     b"</html>")
                else:
                    message = f'Sorry, "{keyword}" is not in our corpus. Make sure the keyword is only one word and there are no spaces.'
                    self.wfile.write(b"<html>" + message.encode("utf-8") +
                                     b"</html>")
Example #9
start = perf_counter()

Data = pd.read_csv(filename, sep=',', header=None, usecols=[0])
Data.columns = ["Chinese"]
end = perf_counter()
print(f"Took: {round(end-start,sigfigs=2)}")

Freqs = {}

print("finding freq...")
start = perf_counter()
for i in range(len(Data)):
    characters = list(Data["Chinese"][i])

    for character in characters:
        if hanzidentifier.has_chinese(character):
            if character not in Freqs:
                Freqs[character] = 1
            else:
                Freqs[character] += 1
        else:
            pass
            #print(f"skipped {character}")

end = perf_counter()
print(f"Took: {round(end-start,sigfigs=2)}")

Freq_data = pd.DataFrame(columns=["Chinese", "Freq"])

print("rearranging...")
start = perf_counter()
Example #10
import io
import json
import os

import hanzidentifier  # to see if file contains chinese
import pinyin  # to convert chinese characters to pinyin

# io.open("myfile.txt", 'r', encoding="windows-1252")
print("JUST DANCE KTAPE ENCRYPTOR BY YUNYL")
filename = input("Enter JDU ktape: ")

with io.open(filename, 'r', encoding="utf-8") as json_file:
    data = json.load(json_file)

# check all lyrics for chinese and convert to its english pronunciation
for track in data["Clips"]:
    # if the track contains chinese characters
    if hanzidentifier.has_chinese(track['Lyrics']):
        # convert to english pronunciation
        track['Lyrics'] = pinyin.get(
            track['Lyrics'], format="strip", delimiter=" ") + " "

i = 0

songname = (json.dumps(data['MapName'], sort_keys=False, indent=4).lower())
songnamenormal = songname[1:-1]
songnameready = (("WII_" + songnamenormal))
songnamereadyt = songnameready + "_tml_karaoke.ktape.ckd"
filewhereput = open(songnamereadyt, "w")
howmanyclipsinktape = (len(data['Clips']))
howmanyclipsinktapeadd10000 = howmanyclipsinktape * 200 + 3514
howmanyclipsinktapehexreadywith10000 = (hex(
    int(howmanyclipsinktapeadd10000)).replace("0x", ""))
Example #11
 def test_has_chinese(self):
     self.assertFalse(hanzidentifier.has_chinese(UNKNOWN))
     self.assertTrue(hanzidentifier.has_chinese(BOTH))
Example #12
def parse_file(filename_traditional, filename_simplified_jyutping, entries):
    simplified = []
    traditional = []
    with open(filename_traditional, "r", encoding="utf8") as f:
        reader = csv.reader(f, delimiter="\t")
        traditional = list(reader)

    # The Kaifangcidian data for jyutping is horrible.
    # The entire data is on a single line, printed like a flat Python list.
    # The entry may be a single item in the array, or multiple items.
    # The Jyutping pronunciation is a separate item for each character in the entry.
    # The translations to Mandarin may, or may not follow the Jyutping!
    # And there is no separator between data for different entries :)
    last_line = ""
    with open(filename_simplified_jyutping, "r", encoding="utf8") as f:
        last_line = f.readlines()[-1]
    simplified = ast.literal_eval(last_line)

    index = 0
    for row in range(len(traditional)):
        if row < 9:  # The first nine rows are comments and headers
            continue

        trad = traditional[row][0]

        # Horrible data workaround 1:
        # In KFCD Jyutping data, when the entry has Chinese characters in it,
        # the entry is presented as a single string in the array. (This is sane.)
        # If it does not (e.g. the word 'pat pat'), each series of characters, delineated
        # by a space, is a separate entry in the array ('pat pat' => ["pat", "pat"])
        trad_len = len(trad.split(" "))
        if not hanzidentifier.has_chinese(trad):
            simp = "".join(simplified[index:index + trad_len])
        else:
            simp = simplified[index]

        # Horrible data workaround 2:
        # In KFCD Jyutping data, the Jyutping for each word in an entry
        # is presented as a separate string.
        # To find the indices that correspond to the entry we just extracted,
        # use the data from the KFCD Yale edition (which is formatted as a CSV) to
        # determine how many items comprise the Jyutping pronunciation.
        # One cannot use the string length of the entry, as it may contain punctuation
        # (e.g. ',') that has no corresponding Jyutping syllable, AND the entry
        # may be split up into multiple items (as described in horrible
        # workaround #1).
        jyut_len = len(traditional[row][1].split(" "))
        jyut = " ".join(simplified[index + trad_len:index + trad_len +
                                   jyut_len])

        pin = (" ".join(
            lazy_pinyin(trad, style=Style.TONE3,
                        neutral_tone_with_five=True)).lower().replace(
                            "v", "u:"))

        # Horrible data workaround 3:
        # In the KFCD Yale data, all the definitions are listed as a single item, separated
        # by the wide-character ','. Some entries have definitions, and some do not.
        # In the KFCD Jyutping edition, the definitions are also listed all as a single item.
        # However, many words do not have definitions; if there are no definitions then
        # we do NOT need to advance the index by 1 more item (which would have been
        # the definitions).
        if traditional[row][2]:
            defs_traditional = traditional[row][2].split(",")
            defs_simplified = simplified[index + trad_len +
                                         jyut_len].split(",")
            definitions = []
            for (def_traditional,
                 def_simplified) in zip(defs_traditional, defs_simplified):
                if def_traditional != def_simplified:
                    definitions.append(def_traditional + " – " +
                                       def_simplified)
                else:
                    definitions.append(def_traditional)
            index += trad_len + jyut_len + 1
        else:
            definitions = ["(沒有對應漢語詞彙)"]
            index += trad_len + jyut_len

        entry = objects.Entry(trad=trad,
                              simp=simp,
                              pin=pin,
                              jyut=jyut,
                              defs=definitions)

        if trad in entries:
            entries[trad].append(entry)
        else:
            entries[trad] = [entry]
Example #13
for filename in os.listdir(hkcancor_path):
    file = open(os.path.join(hkcancor_path, filename))
    print("processing: " + filename)
    sentence = []
    x = True

    while True:
        line = file.readline()

        if "<sent_tag>" in line:
            x = True

        while x:
            for char in line:
                if has_chinese(char):
                    sentence.append(char)
                elif char == "。" or char == ",":
                    sentence.append(char)

            if "</sent_tag>" in line:
                labels.append(0 if random() <= 0.3 else 1)
                print(''.join(sentence))
                sentences.append(''.join(sentence))
                sentence = []
                x = False
                break

            break

        if not line:
            break