Example no. 1
def make_cal_pos_hashtable(cutoff=0):
    obj = {}
    with open(full_cal_db_location, 'rb') as cal:
        for line in cal:
            try:
                lineObj = cal_tools.parseCalLine(line, False, False)
            except IndexError:
                print line
                continue
            word = lineObj["word"]
            pos = lineObj["POS"]
            if word not in obj:
                obj[word] = []

            #pos_set = set(obj[word])
            #pos_set.add(pos)
            obj[word].append(pos)

    num_one_pos_words = 0
    total_num_pos = 0
    for word, pos in reversed(obj.items()):
        pos_counts = {}
        for p in pos:
            if p not in pos_counts:
                pos_counts[p] = 0
            pos_counts[p] += 1
        obj[word] = pos_counts
        if len(pos_counts) < cutoff:
            del obj[word]
            continue
        total_num_pos += len(pos_counts)
        if len(pos_counts) == 1:
            num_one_pos_words += 1

    print "Percent Words With 1 POS", round(
        100.0 * num_one_pos_words / len(obj), 3)
    print "Avg Num POS per word", round(1.0 * total_num_pos / len(obj), 3)

    cal_tools.saveUTFStr(obj, "cal_pos_hashtable.json")
    f = codecs.open("double_pos_before_eng.txt", "wb", encoding='utf8')
    for word, pos in obj.items():
        f.write(u'{} ~-~ {}\n'.format(word, str(pos)))
    f.close()
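make_cal_pos_hashtable relies on module-level names the snippet does not show: import codecs, the project's cal_tools helper module, and the full_cal_db_location path to the CAL database dump. Setting those dependencies aside, a minimal standalone sketch of the counting step it performs, with made-up (word, POS) pairs standing in for parsed CAL lines, looks like this:

from collections import Counter

# hypothetical (word, POS) pairs standing in for parsed CAL lines
parsed = [(u"mlk", "noun"), (u"mlk", "verb"), (u"mlk", "noun"), (u"br", "noun")]

pos_hashtable = {}
for word, pos in parsed:
    pos_hashtable.setdefault(word, []).append(pos)

# collapse each word's POS list into {POS: count}, as the function above does
for word, pos_list in pos_hashtable.items():
    pos_hashtable[word] = dict(Counter(pos_list))

print pos_hashtable  # {u'mlk': {'noun': 2, 'verb': 1}, u'br': {'noun': 1}} (key order may vary)

With cutoff left at 0 nothing is filtered; a cutoff of 2 would delete every word that was only ever seen with a single part of speech before the statistics are printed.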
Example no. 2
def make_cal_pos_hashtable(cutoff=0):
    obj = {}
    with open(full_cal_db_location,'rb') as cal:
        for line in cal:
            try:
                lineObj = cal_tools.parseCalLine(line,False,False)
            except IndexError:
                print line
                continue
            word = lineObj["word"]
            pos = lineObj["POS"]
            if word not in obj:
                obj[word] = []

            #pos_set = set(obj[word])
            #pos_set.add(pos)
            obj[word].append(pos)

    num_one_pos_words = 0
    total_num_pos = 0
    for word,pos in reversed(obj.items()):
        pos_counts = {}
        for p in pos:
            if p not in pos_counts:
                pos_counts[p] = 0
            pos_counts[p] += 1
        obj[word] = pos_counts
        if len(pos_counts) < cutoff:
            del obj[word]
            continue
        total_num_pos += len(pos_counts)
        if len(pos_counts) == 1:
            num_one_pos_words += 1

    print "Percent Words With 1 POS",round(100.0*num_one_pos_words/len(obj),3)
    print "Avg Num POS per word",round(1.0*total_num_pos/len(obj),3)

    cal_tools.saveUTFStr(obj,"cal_pos_hashtable.json")
    f = codecs.open("double_pos_before_eng.txt","wb",encoding='utf8')
    for word,pos in obj.items():
        f.write(u'{} ~-~ {}\n'.format(word,str(pos)))
    f.close()
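Example no. 2 is the same function with the original spacing. The result is persisted with cal_tools.saveUTFStr under the name cal_pos_hashtable.json; assuming that helper writes plain UTF-8 JSON (the .json name suggests it, but its implementation is not shown here), the saved table could be read back and queried roughly like this:

import json

# assumption: cal_tools.saveUTFStr produced ordinary UTF-8 JSON
with open("cal_pos_hashtable.json", "rb") as f:
    pos_table = json.load(f)

# each entry maps a word to its {POS: count} dict; report the most frequent tag per word
for word, counts in pos_table.items():
    best_pos = max(counts, key=counts.get)
    print u'{} -> {} {}'.format(word, best_pos, counts)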
Example no. 3
def make_cal_segments(mesechta):
    def get_daf_str(daf_num, daf_side_num):
        return '{}{}'.format(daf_num, 'a' if daf_side_num == 1 else 'b')

    cal_gem_lines = []
    with open("{}{}.txt".format(mesechta_cal_db_location, mesechta),
              "rb") as f:
        temp_gem_line = []
        curr_gem_line_num = -1
        curr_daf = ''
        for line in f:
            line_obj = cal_tools.parseCalLine(line, True, False)
            line_obj["daf"] = get_daf_str(
                line_obj['pg_num'], line_obj['side'])  #add a daf str prop
            line_obj["word"] = line_obj["word"].replace("'", '"')
            if len(line_obj["word"]) > 1 and line_obj["word"][-1] == '"':
                line_obj["word"] = line_obj["word"][0:
                                                    -1]  #remove abbreviations

            if line_obj["line_num"] != curr_gem_line_num:
                if len(temp_gem_line) > 0:
                    small_gem_lines = [temp_gem_line]
                    has_big_lines = True

                    #repeatedly halve lines longer than 5 words until every piece fits
                    while has_big_lines:
                        has_big_lines = False
                        new_small_gem_lines = []
                        for gem_line in small_gem_lines:
                            if len(gem_line) > 5:
                                has_big_lines = True
                                cut_index = len(gem_line) // 2
                                new_small_gem_lines.append(
                                    gem_line[:cut_index])
                                new_small_gem_lines.append(
                                    gem_line[cut_index:])
                            else:
                                new_small_gem_lines.append(gem_line)
                        small_gem_lines = new_small_gem_lines
                    for gem_line in small_gem_lines:
                        cal_gem_lines.append(gem_line)
                temp_gem_line = [line_obj]
                curr_gem_line_num = line_obj["line_num"]
            else:
                temp_gem_line.append(line_obj)
    '''
    #clean up lines with only 1 or 2 words
    new_cal_gem_lines = []
    new_cal_gem_dafs = []

    for i,clt in enumerate(zip(cal_gem_lines,cal_gem_line_nums,cal_gem_dafs)):
        cal_line = clt[0], line_num = clt[1], daf = clt[2]
        if i > 0 and cal_gem_dafs[i-1] == daf and line_num-cal_gem_line_nums[i-1] <= 1:
            p_cal_line = cal_gem_lines[i-1]
        else:
            p_cal_line = None

        if i < len(cal_gem_lines)-1 and cal_gem_dafs[i+1] == daf and cal_gem_line_nums[i+1]-line_num <= 1:
            n_cal_line = cal_gem_lines[i+1]
        else:
            n_cal_line = None

        if len(cal_line) <= 2
    '''

    #group the lines by daf
    all_daf_lines = []
    all_dafs = []
    curr_daf = ''
    curr_daf_lines = []
    for line in cal_gem_lines:
        if line[0]["daf"] != curr_daf:
            if len(curr_daf_lines) > 0:
                all_daf_lines.append(curr_daf_lines)
                all_dafs.append(curr_daf)
            curr_daf = line[0]["daf"]
            curr_daf_lines = [line]
        else:
            curr_daf_lines.append(line)

    # dont forget to add the last daf in
    if len(curr_daf_lines) > 0:
        all_daf_lines.append(curr_daf_lines)
        all_dafs.append(curr_daf)

    cal_tools.saveUTFStr({
        "lines": all_daf_lines,
        "dafs": all_dafs
    }, "cal_lines_{}.json".format(mesechta))
Example no. 4
def make_cal_segments(mesechta):

    def get_daf_str(daf_num,daf_side_num):
        return '{}{}'.format(daf_num,'a' if daf_side_num == 1 else 'b')

    cal_gem_lines = []
    with open("{}{}.txt".format(mesechta_cal_db_location, mesechta), "rb") as f:
        temp_gem_line = []
        curr_gem_line_num = -1
        curr_daf = ''
        for line in f:
            line_obj = cal_tools.parseCalLine(line,True,False)
            line_obj["daf"] = get_daf_str(line_obj['pg_num'],line_obj['side']) #add a daf str prop
            line_obj["word"] = line_obj["word"].replace("'",'"')
            if len(line_obj["word"]) > 1 and line_obj["word"][-1] == '"':
                line_obj["word"] = line_obj["word"][0:-1] #remove abbreviations

            if line_obj["line_num"] != curr_gem_line_num:
                if len(temp_gem_line) > 0:
                    small_gem_lines = [temp_gem_line]
                    has_big_lines = True

                    #repeatedly halve lines longer than 5 words until every piece fits
                    while has_big_lines:
                        has_big_lines = False
                        new_small_gem_lines = []
                        for gem_line in small_gem_lines:
                            if len(gem_line) > 5:
                                has_big_lines = True
                                cut_index = len(gem_line) // 2
                                new_small_gem_lines.append(gem_line[:cut_index])
                                new_small_gem_lines.append(gem_line[cut_index:])
                            else:
                                new_small_gem_lines.append(gem_line)
                        small_gem_lines = new_small_gem_lines
                    for gem_line in small_gem_lines:
                        cal_gem_lines.append(gem_line)
                temp_gem_line = [line_obj]
                curr_gem_line_num = line_obj["line_num"]
            else:
                temp_gem_line.append(line_obj)

    '''
    #clean up lines with only 1 or 2 words
    new_cal_gem_lines = []
    new_cal_gem_dafs = []

    for i,clt in enumerate(zip(cal_gem_lines,cal_gem_line_nums,cal_gem_dafs)):
        cal_line = clt[0], line_num = clt[1], daf = clt[2]
        if i > 0 and cal_gem_dafs[i-1] == daf and line_num-cal_gem_line_nums[i-1] <= 1:
            p_cal_line = cal_gem_lines[i-1]
        else:
            p_cal_line = None

        if i < len(cal_gem_lines)-1 and cal_gem_dafs[i+1] == daf and cal_gem_line_nums[i+1]-line_num <= 1:
            n_cal_line = cal_gem_lines[i+1]
        else:
            n_cal_line = None

        if len(cal_line) <= 2
    '''

    #group the lines by daf
    all_daf_lines = []
    all_dafs = []
    curr_daf = ''
    curr_daf_lines = []
    for line in cal_gem_lines:
        if line[0]["daf"] != curr_daf:
            if len(curr_daf_lines) > 0:
                all_daf_lines.append(curr_daf_lines)
                all_dafs.append(curr_daf)
            curr_daf = line[0]["daf"]
            curr_daf_lines = [line]
        else:
            curr_daf_lines.append(line)

    # dont forget to add the last daf in, otherwise the final page side is dropped
    if len(curr_daf_lines) > 0:
        all_daf_lines.append(curr_daf_lines)
        all_dafs.append(curr_daf)

    cal_tools.saveUTFStr({"lines":all_daf_lines,"dafs":all_dafs},"cal_lines_{}.json".format(mesechta))
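For reference, the daf keys used by the grouping loop come from get_daf_str, which just appends 'a' or 'b' to the page number depending on the side:

def get_daf_str(daf_num, daf_side_num):
    return '{}{}'.format(daf_num, 'a' if daf_side_num == 1 else 'b')

print get_daf_str(2, 1), get_daf_str(2, 2), get_daf_str(34, 1)  # 2a 2b 34a

So cal_lines_{mesechta}.json ends up with one entry in "dafs" per page side, aligned index-for-index with that side's list of word-group lines in "lines".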