def read_from_sentence_txt(start, emission, transition):
    ## accumulate start/emission/transition counts from ./result/sentence.txt
    print('read from sentence.txt')
    for line in open(SENTENCE_FILE):
        line = util.as_text(line.strip())
        if len(line) < 2:
            continue
        if not util.is_chinese(line):
            continue

        ## for start
        start.setdefault(line[0], 0)
        start[line[0]] += 1

        ## for emission
        pinyin_list = topinyin(line)
        char_list = [c for c in line]

        for hanzi, pinyin in zip(char_list, pinyin_list):
            emission.setdefault(hanzi, {})
            emission[hanzi].setdefault(pinyin, 0)
            emission[hanzi][pinyin] += 1

        ## for transition
        for f, t in zip(line[:-1], line[1:]):
            transition.setdefault(f, {})
            transition[f].setdefault(t, 0)
            transition[f][t] += 1
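## Usage sketch (assumption, not part of the original module): the three
## arguments are plain nested counters, so HMM probability tables can be
## derived by normalizing the accumulated counts afterwards, e.g.:
##
##     start, emission, transition = {}, {}, {}
##     read_from_sentence_txt(start, emission, transition)
##     total = sum(start.values())
##     start_prob = {hanzi: n / total for hanzi, n in start.items()}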
def stdOut(sentence_result_lists, dicts, scope):
    lists = list()
    j = 0
    sentence_nbr = len(dicts)
    # damp each score by sentence position: later sentences weigh less
    sentence_result_lists_tmp = [
        (key, score * (1 - math.log(key + 1) / math.log(sentence_nbr)))
        for key, score in sentence_result_lists
    ]
    sentence_result_lists2 = sorted(sentence_result_lists_tmp,
                                    key=lambda item: item[1],
                                    reverse=True)
    try:
        for sentence_id, _score in sentence_result_lists2:
            tmp = dicts[sentence_id]
            tmp2 = [x for x in tmp if is_chinese(x)]
            if (len(tmp2) < 8 or contain_redundant(
                    redundant_dict='../resource/redundant_dict.txt',
                    string_with_redundant=tmp)):
                continue
            j += 1
            result_str = removePrefix(tmp.strip(" "), "”")
            result = distattr2(sentence_id, result_str)
            lists.append(result)
            if (j >= scope):
                break
        std = sorted(lists,
                     key=lambda x: 0.5 * len(x.strs) / (x.ids + 1),
                     reverse=True)
    except Exception:
        std = lists
    return std
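## Note on the position damping above (illustrative check, assuming the keys
## are 0-based sentence indices): the factor 1 - log(key + 1) / log(n) decays
## from 1 to 0 across the document, favoring early sentences:
##
##     import math
##     n = 100
##     for idx in (0, 9, 99):
##         print(idx, 1 - math.log(idx + 1) / math.log(n))
##     # 0 -> 1.0, 9 -> 0.5, 99 -> 0.0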
Example #3
def change(filepath):
    f = open(filepath, 'r', encoding=testDecode(filepath))
    fnew = open(filepath[:-4] + '_new.txt', 'w+',
                encoding="utf-8")  # write the result to a new file
    firstLine = None

    while True:
        if not firstLine:  # nothing pending: read the next line
            firstLine = f.readline()

        if not firstLine:  # EOF
            break

        newLine = firstLine.strip()
        if len(newLine) != 0:  # the first line has text
            secondLine = f.readline()
            if not secondLine:  # no second line: flush and exit
                fnew.write(firstLine.strip())
                break

            while len(secondLine.strip()) == 0:  # skip blank lines
                secondLine = f.readline()
                if not secondLine:
                    break
            if not secondLine:  # hit EOF while skipping: flush and exit
                fnew.write(firstLine.strip())
                break

            firstLast = firstLine.strip()[-1]
            if firstLast in [
                    "。", "*", ":", ">", "」", ')', '?', '!', ')', '=', '”', '^',
                    '】', '▲', '▽', '☆', '○', '¨', '╔', '?', '》', ';'
            ]:  # may start a new paragraph
                fnew.write(firstLine.rstrip())
                fnew.write("\n")
                firstLine = "    %s" % (secondLine.strip())
            elif firstLast in [",", "…", "、", '「', '(', "(", "<", '—'] \
                    or is_chinese(firstLast) or firstLast.isalnum():  # must not break the paragraph
                fnew.write(firstLine.rstrip())
                firstLine = secondLine.strip()
            else:
                print("special character %s" % firstLast)
                fnew.write(firstLine.rstrip())
                fnew.write("\n")
                firstLine = "    %s" % (secondLine.strip())
            fnew.flush()
        else:  # the first line is blank: read again
            firstLine = f.readline()
    f.close()
    fnew.close()
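## Usage sketch (assumption: a hard-wrapped .txt file whose lines should be
## re-joined into paragraphs; testDecode is the encoding detector used above):
##
##     change('novel.txt')  # writes the re-flowed text to novel_new.txt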
def extract_chinese_sentences(content):
    content = util.as_text(content)
    content = content.replace(' ', '')
    content = content.replace('\t', '')
    sentences = []
    s = ''
    for c in content:
        if util.is_chinese(c):
            s += c
        else:
            sentences.append(s)
            s = ''
    sentences.append(s)

    return [s.strip() for s in sentences if len(s.strip()) > 1]
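## Usage sketch (illustrative input, not from the source): non-Chinese
## characters act as delimiters, and single-character fragments are dropped:
##
##     extract_chinese_sentences('你好,世界!Hello 这是一句中文。')
##     # -> ['你好', '世界', '这是一句中文']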
def read_from_word_txt(start, emission, transition):
    ## ! weighting refinement based on word.txt
    print('read from word.txt')
    _base = 1000.
    _min_value = 2.
    for line in open(WORD_FILE):
        line = util.as_text(line.strip())
        if '=' not in line:
            continue
        if len(line) < 3:
            continue
        ls = line.split('=')
        if len(ls) != 2:
            continue
        word, num = ls
        word = word.strip()
        num = num.strip()
        if len(num) == 0:
            continue
        num = float(num)
        num = max(_min_value, num / _base)

        if not util.is_chinese(word):
            continue

        ## for start
        start.setdefault(word[0], 0)
        start[word[0]] += num

        ## for emission
        pinyin_list = topinyin(word)
        char_list = [c for c in word]
        for hanzi, pinyin in zip(char_list, pinyin_list):
            emission.setdefault(hanzi, {})
            emission[hanzi].setdefault(pinyin, 0)
            emission[hanzi][pinyin] += num

        ## for transition
        for f, t in zip(word[:-1], word[1:]):
            transition.setdefault(f, {})
            transition[f].setdefault(t, 0)
            transition[f][t] += num
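## Note (reading of the constants above, stated as an assumption): raw word
## frequencies are scaled down by _base = 1000 and floored at _min_value = 2,
## so a word counted 50000 times contributes 50.0 to each table, while any
## rarer word still contributes at least 2.0.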
Example #6
def stdOut(self, rank, dicts, top):
    lists = list()
    j = 0
    try:
        for sentence_id in rank:
            tmp = dicts[sentence_id]
            tmp2 = [x for x in tmp if is_chinese(x)]
            if (len(tmp2) < 8 or contain_redundant(
                    redundant_dict='../resource/redundant_dict.txt',
                    string_with_redundant=tmp)):
                continue
            j += 1
            result_str = removePrefix(tmp.replace(" ", ""), "”")
            result = distattr2(sentence_id, result_str)
            lists.append(result)
            if (j >= top):
                break
        std = sorted(lists, key=lambda x: x.ids)
    except Exception:
        std = lists
    return std
def calc_x_offset(self, text, size):
    # CJK glyphs render wider than ASCII, so weight the two classes differently
    offset = sum(size * 1.6 for st in text if is_chinese(st))
    offset += sum(size * 0.8 for st in text if not is_chinese(st))
    return offset
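## Usage sketch (hypothetical font size): at size = 12, a string with two
## Chinese characters and three ASCII characters yields
## 2 * 12 * 1.6 + 3 * 12 * 0.8 = 67.2 units of width.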
Example #8
def testChinese():
    for ch in "我1.。a":  # only "我" is Chinese, so only it prints True
        print("%s is chinese:%r" % (ch, is_chinese(ch)))
Example #9
def main():
    # load data
    conn = MySQLdb.connect(host=args.host,
                           user=args.user,
                           passwd=args.passwd,
                           db=args.db,
                           charset='utf8')
    cur = conn.cursor()
    cur.execute('select id, content_html from t_crawler_obj limit ' +
                args.file[0] + ',' + args.file[1])
    data = cur.fetchall()

    # load model
    model = doc2vec.Doc2Vec.load(args.model)
    # parse data by beautiful soup
    dicts1 = dict()
    for line in data:
        ids, content_html = line
        content = BeautifulSoup(content_html, "html.parser")
        dicts1[ids] = content.get_text()

    # split sentences: dicts2 -> key: paper id, value: dicttmp
    #   dicttmp -> key: sentence id, value: sentence string
    dicts2 = defaultdict(dict)
    for key, value in dicts1.items():
        lists = cut_sentence_new(value)
        dicttmp = dict()
        for key2, value2 in enumerate(lists):
            dicttmp[key2] = value2
        dicts2[key] = dicttmp

    # split words: dicts3 -> key: paper id, value: dicttmp
    #   dicttmp -> key: sentence id, value: token list
    dicts3 = defaultdict(dict)
    analyse.set_stop_words('../resource/stop_words.txt')
    for key, value in dicts2.items():
        dicttmp = dict()
        for key2, value2 in value.items():
            seg_list = jieba.cut(
                string_parser(punc_file='../resource/punc_file.txt',
                              string_with_punc=value2))
            seg_list = filter(lambda x: x != " ", seg_list)
            lists = list(seg_list)
            if (len(lists) >= 3):  # keep sentences with at least 3 tokens
                dicttmp[key2] = lists
        dicts3[key] = dicttmp


    # vectorization and TextRank
    for key, value in dicts3.items():
        dictrember = dict()
        X = list()
        i = 0
        for key2, value2 in value.items():
            dictrember[i] = key2  # i: X index; key2: sentence order
            X.append(model.infer_vector(value2))
            i += 1
        X = np.array(X, dtype='float32')
        distance_matrix = pairwise_distances(X, metric='cosine')
        rank = rankgetter(distance_matrix=distance_matrix,
                          dictrember=dictrember)
        j = 0
        try:
            lists = list()
            for info in rank:
                ind = info.ids  # sentence order
                tmp = dicts2[key][ind]
                tmp2 = [x for x in tmp if is_chinese(x)]
                if (len(tmp2) < 8 or contain_redundant(
                        redundant_dict='../resource/redundant_dict.txt',
                        string_with_redundant=dicts2[key][ind])):
                    continue
                j += 1
                result_str = removePrefix(dicts2[key][ind].replace(" ", ""),
                                          "”")
                result = distattr2(ind, result_str)
                lists.append(result)
                if (j >= args.top):
                    break

            # print the results in the original sentence order
            ordered = sorted(lists, key=lambda x: x.ids)
            for key3, sentence3 in enumerate(ordered):
                print(str(key) + " " + str(key3 + 1) + ": " + sentence3.strs)

        except Exception:
            print("No More Qualified Sentence!")