Example #1
0
def make_cal_db_word_list(text_name):
    """Dump the words of a CAL db text, with per-line word counts, to JSON.

    Reads ``caldb_<text_name>.txt`` line by line and parses each line with
    ``cal_tools.parseCalLine``.  The parsed ``"word"`` is replaced by its
    entry in the module-level ``abbrev_map`` when one exists, then split on
    single spaces.  The result is written to ``caldb_words_<text_name>.json``
    as an object with two keys:

    * ``"words"``: every token, in file order;
    * ``"line_lens"``: the token count of each run of consecutive entries
      that share the same ``"line_num"``.

    The count of the final run is always appended, so an input file with no
    lines produces ``"line_lens": [0]``.

    :param text_name: suffix used in both the input and output file names.
    """
    word_list_out = []
    line_len_list = []
    curr_line = -1  # -1 means "no entry seen yet"
    num_words_on_line = 0
    with open("caldb_{}.txt".format(text_name), "r") as caldb:
        for line in caldb:
            line_obj = cal_tools.parseCalLine(line, True, False)
            raw_word = line_obj["word"]
            word = abbrev_map[raw_word] if raw_word in abbrev_map else raw_word
            word_list = word.split(" ")
            line_num = line_obj["line_num"]
            if curr_line == -1:
                curr_line = line_num
            if curr_line != line_num:
                # A new line started: flush the finished count and restart it.
                line_len_list.append(num_words_on_line)
                num_words_on_line = len(word_list)
                curr_line = line_num
            else:
                num_words_on_line += len(word_list)
            word_list_out.extend(word_list)
    line_len_list.append(num_words_on_line)  # for the last line
    doc = {"words": word_list_out, "line_lens": line_len_list}
    # The context manager guarantees the JSON is flushed and the handle closed.
    # json.dump's ``encoding`` keyword is omitted: 'utf-8' is already its
    # default on Python 2 and the keyword is rejected on Python 3.
    with codecs.open("caldb_words_{}.json".format(text_name), "w", encoding='utf-8') as fp:
        json.dump(doc, fp, indent=4, ensure_ascii=False)
Example #2
0
def make_pos_hashtable_cal():
    """Build a word -> POS/head-word table from ``caldb.txt`` and save it.

    Each line of ``caldb.txt`` is parsed with ``cal_tools.parseCalLine``.
    For every parsed ``"word"`` the table keeps two parallel lists, ``"POS"``
    and ``"head_word"``.  A POS whose first character is a lowercase ``p`` is
    rewritten with an uppercase ``P`` before being stored.

    Words that end up with more than one POS are printed for inspection, and
    the full table is passed to ``cal_tools.saveUTFStr`` with the target name
    ``cal_pos_hashtable.json``.
    """
    out = {}
    with open("caldb.txt", "r") as cal:
        for line in cal:
            line_obj = cal_tools.parseCalLine(line, False, False)
            word = line_obj["word"]
            head_word = line_obj["head_word"]
            pos = line_obj["POS"]

            # Slicing instead of indexing: an empty POS no longer raises
            # IndexError, it is simply left unchanged.
            if pos[:1] == 'p':
                pos = "P" + pos[1:]

            entry = out.get(word)
            if entry is None:
                out[word] = {
                    "POS": [pos],
                    "head_word": [head_word],
                }
                continue

            # NOTE(review): a pair is recorded only when BOTH the POS and the
            # head word are new for this word, so a new POS with an already
            # seen head word is dropped. Kept as-is; confirm this is intended.
            if pos not in entry["POS"] and head_word not in entry["head_word"]:
                # The lists are mutated in place, so no re-assignment is needed.
                entry["POS"].append(pos)
                entry["head_word"].append(head_word)

    for key in out:
        entry = out[key]
        if len(entry["POS"]) > 1:
            # One pre-formatted string gives the same space-separated output
            # as the Python 2 print statement and also parses on Python 3.
            print(u"^{}^ {} {}".format(key, u"*-*".join(entry["head_word"]), entry["POS"]))

    cal_tools.saveUTFStr(out, "cal_pos_hashtable.json")
Example #3
0
def make_pos_hashtable_cal():
    """Build a word -> POS/head-word table from ``caldb.txt`` and save it.

    Each line of ``caldb.txt`` is parsed with ``cal_tools.parseCalLine``.
    For every parsed ``"word"`` the table keeps two parallel lists, ``"POS"``
    and ``"head_word"``.  A POS whose first character is a lowercase ``p`` is
    rewritten with an uppercase ``P`` before being stored.

    Words that end up with more than one POS are printed for inspection, and
    the full table is passed to ``cal_tools.saveUTFStr`` with the target name
    ``cal_pos_hashtable.json``.
    """
    out = {}
    with open("caldb.txt", "r") as cal:
        for line in cal:
            line_obj = cal_tools.parseCalLine(line, False, False)
            word = line_obj["word"]
            head_word = line_obj["head_word"]
            pos = line_obj["POS"]

            # Slicing instead of indexing: an empty POS no longer raises
            # IndexError, it is simply left unchanged.
            if pos[:1] == 'p':
                pos = "P" + pos[1:]

            entry = out.get(word)
            if entry is None:
                out[word] = {
                    "POS": [pos],
                    "head_word": [head_word],
                }
                continue

            # NOTE(review): a pair is recorded only when BOTH the POS and the
            # head word are new for this word, so a new POS with an already
            # seen head word is dropped. Kept as-is; confirm this is intended.
            if pos not in entry["POS"] and head_word not in entry["head_word"]:
                # The lists are mutated in place, so no re-assignment is needed.
                entry["POS"].append(pos)
                entry["head_word"].append(head_word)

    for key in out:
        entry = out[key]
        if len(entry["POS"]) > 1:
            # One pre-formatted string gives the same space-separated output
            # as the Python 2 print statement and also parses on Python 3.
            print(u"^{}^ {} {}".format(key, u"*-*".join(entry["head_word"]), entry["POS"]))

    cal_tools.saveUTFStr(out, "cal_pos_hashtable.json")
Example #4
0
def make_cal_db_word_list(text_name):
    """Dump the words of a CAL db text, with per-line word counts, to JSON.

    Reads ``caldb_<text_name>.txt`` line by line and parses each line with
    ``cal_tools.parseCalLine``.  The parsed ``"word"`` is replaced by its
    entry in the module-level ``abbrev_map`` when one exists, then split on
    single spaces.  The result is written to ``caldb_words_<text_name>.json``
    as an object with two keys:

    * ``"words"``: every token, in file order;
    * ``"line_lens"``: the token count of each run of consecutive entries
      that share the same ``"line_num"``.

    The count of the final run is always appended, so an input file with no
    lines produces ``"line_lens": [0]``.

    :param text_name: suffix used in both the input and output file names.
    """
    word_list_out = []
    line_len_list = []
    curr_line = -1  # -1 means "no entry seen yet"
    num_words_on_line = 0
    with open("caldb_{}.txt".format(text_name), "r") as caldb:
        for line in caldb:
            line_obj = cal_tools.parseCalLine(line, True, False)
            raw_word = line_obj["word"]
            word = abbrev_map[raw_word] if raw_word in abbrev_map else raw_word
            word_list = word.split(" ")
            line_num = line_obj["line_num"]
            if curr_line == -1:
                curr_line = line_num
            if curr_line != line_num:
                # A new line started: flush the finished count and restart it.
                line_len_list.append(num_words_on_line)
                num_words_on_line = len(word_list)
                curr_line = line_num
            else:
                num_words_on_line += len(word_list)
            word_list_out.extend(word_list)
    line_len_list.append(num_words_on_line)  # for the last line
    doc = {"words": word_list_out, "line_lens": line_len_list}
    # The context manager guarantees the JSON is flushed and the handle closed.
    # json.dump's ``encoding`` keyword is omitted: 'utf-8' is already its
    # default on Python 2 and the keyword is rejected on Python 3.
    with codecs.open("caldb_words_{}.json".format(text_name), "w", encoding='utf-8') as fp:
        json.dump(doc, fp, indent=4, ensure_ascii=False)
Example #5
0
def make_cal_db_word_list(text_name):
    """Dump the words and prefixed head words of a CAL db text to JSON.

    Reads ``caldb_<text_name>.txt`` line by line and parses each line with
    ``cal_tools.parseCalLine``.  The result is written to
    ``caldb_words_<text_name>.json`` as an object with two parallel lists:

    * ``"words"``: the parsed ``"word"`` of every line, in file order;
    * ``"head_words"``: the parsed ``"head_word"`` preceded by the line's
      ``"prefix"`` entries (underscores removed, concatenated), when the
      parsed line has a ``"prefix"`` key.

    :param text_name: suffix used in both the input and output file names.
    """
    word_list_out = []
    hword_list_out = []
    with open("caldb_{}.txt".format(text_name), "r") as caldb:
        for line in caldb:
            line_obj = cal_tools.parseCalLine(line, True, False)
            word_list_out.append(line_obj["word"])
            if "prefix" in line_obj:
                prefix_list = [pre.replace("_", "") for pre in line_obj["prefix"]]
            else:
                prefix_list = []
            hword_list_out.append("".join(prefix_list) + line_obj["head_word"])
    doc = {"words": word_list_out, "head_words": hword_list_out}
    # The context manager guarantees the JSON is flushed and the handle closed.
    # json.dump's ``encoding`` keyword is omitted: 'utf-8' is already its
    # default on Python 2 and the keyword is rejected on Python 3.
    with codecs.open("caldb_words_{}.json".format(text_name), "w", encoding='utf-8') as fp:
        json.dump(doc, fp, indent=4, ensure_ascii=False)