Code example #1
import re

import policy_util  # project-local helper that provides cut_sentence()


def init_train_test_data(path):
    """
    Extract all annotated data; each item is a dict of the form
    {'sent': original sentence, 'entities': [(entity, type), ...]}.
    :param path: path to the annotated text file
    :return: list of {'sent': ..., 'entities': ...} dicts
    """
    pp = []
    paras = []
    result = []
    with open(path, 'r', encoding='utf-8') as fp:
        lines = fp.readlines()
        for line in lines:
            sents = policy_util.cut_sentence(line)
            paras += sents
        for para in paras:
            # Strip ASCII letters, digits and markup characters from the sentence.
            p = re.sub(r"[A-Za-z0-9$#*\[\]]", "", para)
            # extract_mark_label() is assumed to be defined elsewhere in the project.
            entity = extract_mark_label(para)
            # Text inside Chinese quotation marks is treated as an Event entity.
            match = re.search(r'“.*”', p)
            if match:
                event = match.group()
                entity.append((event[1:-1], "Event"))
            if not entity:
                continue
            # Skip overly long sentences and sentences with fewer than two entities.
            if len(p) > 100 or len(entity) < 2:
                continue
            pp.append(p)
            result.append({'sent': p, 'entities': entity})
    print(len(set(pp)))  # number of unique sentences kept
    return result
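
A hypothetical usage sketch; the file path is a placeholder and the sample output shape is illustrative only:

samples = init_train_test_data("data/marked.txt")  # placeholder path
# Each item pairs a cleaned sentence with its (entity, type) tuples, e.g.
# {'sent': '...', 'entities': [('...', '...'), ('...', 'Event')]}
for sample in samples[:3]:
    print(sample['sent'], sample['entities'])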
Code example #2
from collections import Counter

import pandas as pd

import policy_util  # project-local helper that provides cut_sentence()


def build_vocabulary(data: pd.DataFrame):
    """Build a char-to-index vocabulary, ordered by descending character frequency."""
    counter = Counter()
    vocabulary = dict()
    for index, row in data.iterrows():
        para = row['content']
        sentences = policy_util.cut_sentence(para)
        for sent in sentences:
            counter.update(sent)  # count every character in the sentence
    # most_common() with no argument yields all characters, most frequent first,
    # so frequent characters receive the smallest indices.
    for char, _ in counter.most_common():
        vocabulary[char] = len(vocabulary)
    return vocabulary
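
A minimal standalone sketch of the frequency-ordered indexing used above; the sample string is made up and policy_util is not needed here:

from collections import Counter

counter = Counter("aabbbc")
vocabulary = {}
for char, _ in counter.most_common():
    vocabulary[char] = len(vocabulary)
print(vocabulary)  # {'b': 0, 'a': 1, 'c': 2} -- the most frequent char gets index 0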
Code example #3
from jieba import posseg

import policy_util  # project-local helper that provides cut_sentence()


def get_vectorization_init(data):
    """Segment every sentence in the corpus into word lists for vectorization."""
    sents = []
    for index, row in data.iterrows():
        content = row['content']
        paras = policy_util.cut_sentence(content)
        for para in paras:
            sent = []
            words = posseg.cut(para)  # POS-tagged word segmentation
            for word in words:
                if word.flag != 'w':  # drop tokens tagged 'w' (punctuation)
                    sent.append(word.word)
            sents.append(sent)
    return sents
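
For illustration, the (word, flag) pairs that posseg.cut yields look like this; the sample sentence is made up and the exact segmentation depends on jieba's dictionary:

from jieba import posseg

for pair in posseg.cut("企业可申请补贴。"):
    print(pair.word, pair.flag)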
Code example #4
import policy_util  # project-local helper that provides cut_sentence()


def get_entity_para(content, entity):
    """Return the sentences of `content` that mention at least two of the given
    entities, together with the entities found in each sentence."""
    if not content:
        # Return an empty pair so the return type matches the normal path.
        return [], []
    paras = policy_util.cut_sentence(content)
    result = []
    include_entity = []
    for para in paras:
        # Collect every entity mentioned in this sentence.
        entities = [e for e in entity if e in para]
        # Keep only sentences that mention at least two entities.
        if len(entities) >= 2:
            result.append(para)
            include_entity.append(entities)
    return result, include_entity
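
A hypothetical call, assuming policy_util.cut_sentence splits on sentence-final punctuation; the text and entity list are made up:

result, include_entity = get_entity_para(
    "甲公司与乙公司签订合同。丙公司未参与。", ["甲公司", "乙公司", "丙公司"])
# Only the first sentence mentions two entities, so the expected output is
# result == ['甲公司与乙公司签订合同。'] and include_entity == [['甲公司', '乙公司']]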
Code example #5
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

import policy_util  # project-local helper that provides cut_sentence()


def tf_idf_statistic(data):
    """
    Weigh the terms of each document with TF-IDF.
    :param data: DataFrame with a 'content' column
    :return: (feature names, document-term TF-IDF weight matrix)
    """
    all_word = []
    for index, row in data.iterrows():
        content = row['content']
        paras = policy_util.cut_sentence(content)
        # extract_entity() is defined in code example #6 below.
        words = extract_entity(paras)
        all_word.append(' '.join(words))
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    # Raw term counts first, then TF-IDF reweighting.
    tfidf = transformer.fit_transform(vectorizer.fit_transform(all_word))
    # get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
    word = vectorizer.get_feature_names_out()
    weight = tfidf.toarray()
    return word, weight
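
A sketch of reading the result: with numpy, the top-weighted terms of each document can be pulled out of the weight matrix. Here df is a hypothetical DataFrame with a 'content' column:

import numpy as np

word, weight = tf_idf_statistic(df)
for doc_idx, doc_weights in enumerate(weight):
    top = np.argsort(doc_weights)[::-1][:5]  # indices of the 5 largest weights
    print(doc_idx, [word[i] for i in top])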
Code example #6
import jieba.posseg

import policy_util  # project-local helper that provides cut_sentence()


def extract_entity(paragraphs):
    """
    Extract noun-like terms from a list of sentences.
    :param paragraphs: list of paragraph strings
    :return: list of stringified term lists, one per sentence
    """
    extract_term = []
    for para in paragraphs:
        sentences = policy_util.cut_sentence(para)
        for sent in sentences:
            term = []
            sent_seged = jieba.posseg.cut(sent.strip())
            for s in sent_seged:
                sege = str(s.flag)
                # Keep noun-like tags ('n*') and abbreviations ('j'),
                # but drop anything verbal ('v*').
                if (sege.find('n') >= 0
                        or sege.find('j') >= 0) and sege.find('v') < 0:
                    term.append(s.word)
            extract_term.append(term)
    # Each per-sentence term list is stringified so it can be joined downstream.
    extract_term = [str(term) for term in extract_term]
    return extract_term
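
A standalone check of the POS-tag filter above; note that tags containing 'v' are rejected even when they also contain 'n' (e.g. the verb-noun tag 'vn'):

for sege in ['n', 'ns', 'j', 'v', 'vn']:
    keep = (sege.find('n') >= 0 or sege.find('j') >= 0) and sege.find('v') < 0
    print(sege, keep)  # n True, ns True, j True, v False, vn False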
Code example #7
import json
import re

import policy_util  # project-local helper that provides cut_sentence()


def condition_extract(data):
    """Extract sentences containing numeric conditions (percentages, amounts,
    day counts) and dump them to a JSON-lines file."""
    json_res = []
    for index, row in data.iterrows():
        content = row['content']
        paras = policy_util.cut_sentence(content)
        res = []
        for para in paras:
            # Match percentages, monetary amounts (元), day counts (天)
            # and working-day counts (工作日).
            if re.search(r"\d+%|\d+.*元|\d+天|\d+工作日", para):
                res.append(para)
        conditions = '|'.join(res)
        r = {
            'policy_id': index + 1,
            'content': str(row['content_html']),
            'condition': conditions
        }
        json_res.append(json.dumps(r, ensure_ascii=False))
    with open("data/policy_content.json", 'w', encoding='utf-8') as fp:
        for j in json_res:
            fp.write(j + '\n')
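
A quick standalone check of what the condition pattern matches; the sample sentences are made up:

import re

pattern = r"\d+%|\d+.*元|\d+天|\d+工作日"
for s in ["补贴比例为30%", "奖励不超过5000元", "自受理之日起20天内办结", "没有数字条件"]:
    print(s, bool(re.search(pattern, s)))  # True, True, True, False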