Example #1
import numpy as np

def annotate_sentences(model, sents):
    cared_sents = []

    embeddings = word2vec(sents)  # If using USC
    for i, sentence in enumerate(sents):
        if finding_by_heuristics(sentence):
            cared_sents.append(sentence)
            continue
        # Word2Vec feature
        feature1 = embeddings[i].tolist()  # If using USC
        # feature1 = word2vec(sentence)       # If using law2vec
        # Basic features: sentence length, number of periods, percent of characters that are capitalized
        feature2 = basic_features(sentence)
        # Fraction of POS feature
        feature3 = frac_of_pos(sentence)
        # Fraction of years and numbers feature
        feature4 = frac_of_years_and_numbers(sentence)
        # Cue words feature
        feature5 = cue_words(sentence)

        features = np.array(feature1 + feature2 + feature3 + feature4 +
                            feature5).reshape(1, -1)
        pred = model.predict(features)
        if pred[0] == 1:
            cared_sents.append(sentence)

    return cared_sents
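
The feature helpers used above (word2vec, basic_features, frac_of_pos, frac_of_years_and_numbers, cue_words) come from the surrounding project and are not shown in these snippets. As a rough illustration, here is a hypothetical sketch of basic_features, inferred purely from its comment (sentence length, number of periods, fraction of capitalized characters); the project's actual implementation may differ:

# Hypothetical sketch of basic_features, inferred from its comment;
# not the project's actual implementation.
def basic_features(sentence):
    length = len(sentence)
    num_periods = sentence.count('.')
    letters = [c for c in sentence if c.isalpha()]
    # Fraction of letters that are uppercase (0.0 if the sentence has no letters)
    cap_frac = sum(c.isupper() for c in letters) / len(letters) if letters else 0.0
    # Return a list so it concatenates with the other feature lists
    return [length, num_periods, cap_frac]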

Example #2
import os
import pickle

import numpy as np

def combine_features(cases_names):
    # Annotation types present in the corpus (listed for reference;
    # not otherwise used in this function)
    types = [
        'Citation', 'LegalRule', 'LegalPolicy', 'PolicyBasedReasoning',
        'ConclusionOfLaw', 'EvidenceBasedFinding', 'EvidenceBasedReasoning',
        'Evidence', 'Procedure', 'Header'
    ]
    type_set = set(types)

    x = []
    y = []
    for case_name in cases_names:
        annotated_sentences = read_files([case_name])
        pickle_path = '../annotated_casetext/' + case_name[:-4] + '.pickle'
        if not os.path.exists(pickle_path):
            pure_sentences = [sentence for sentence, _ in annotated_sentences]
            embeddings = word2vec(pure_sentences)
            # Cache the embeddings so repeated runs skip the slow encoding step
            with open(pickle_path, 'wb') as handle:
                pickle.dump(embeddings, handle, protocol=2)
        else:
            with open(pickle_path, 'rb') as handle:
                embeddings = pickle.load(handle)

        for i, (sentence, sent_type) in enumerate(annotated_sentences):

            # Keep only Evidence (label 0) and EvidenceBasedFinding (label 1)
            # sentences; skip every other annotation type.
            label = 0
            if sent_type == 'EvidenceBasedFinding':
                label = 1
            elif sent_type != 'Evidence':
                continue
            y.append(label)

            # Word2Vec feature
            feature1 = embeddings[i].tolist()
            # Basic features: sentence length, number of periods, percent of characters that are capitalized
            feature2 = basic_features(sentence)
            # Fraction of POS feature
            feature3 = frac_of_pos(sentence)
            # Fraction of years and numbers feature
            feature4 = frac_of_years_and_numbers(sentence)
            # Cue words feature
            feature5 = cue_words(sentence)

            features = feature1 + feature2 + feature3 + feature4 + feature5
            x.append(np.array(features))

    x = np.array(x)
    y = np.array(y)
    return x, y
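
A hedged usage sketch of how the returned x and y might feed a classifier. scikit-learn, the train/test split, and the cases_names variable are assumptions for illustration; the original project's model choice is not shown in these snippets.

# Hypothetical training sketch; the model and split are assumptions.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

x, y = combine_features(cases_names)  # cases_names: list of case file names
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                    random_state=0)
model = LogisticRegression(max_iter=1000).fit(x_train, y_train)
print('Held-out accuracy:', model.score(x_test, y_test))
# The fitted model can then be passed to annotate_sentences (Example #1).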

Example #3
import os
import pickle

import numpy as np

# annotated_dir, finding_types, and fact_types are assumed to be defined
# elsewhere in the module.
def combine_features(cases_names):
    x = []
    y = []
    for case_name in cases_names:
        annotated_sentences = read_files([case_name], annotated_dir)
        pickle_path = annotated_dir + case_name[:-4] + '.pickle'
        if not os.path.exists(pickle_path):
            pure_sentences = [sentence for sentence, _ in annotated_sentences]
            try:
                embeddings = word2vec(pure_sentences)
            except Exception:
                print("USC failed", case_name)
                continue
            # Cache the embeddings so repeated runs skip the slow encoding step
            with open(pickle_path, 'wb') as handle:
                pickle.dump(embeddings, handle, protocol=2)
        else:
            with open(pickle_path, 'rb') as handle:
                embeddings = pickle.load(handle)

        for i, (sentence, sent_type) in enumerate(annotated_sentences):

            # Findings and facts are positive examples; everything else is 0.
            if sent_type in finding_types or sent_type in fact_types:
                label = 1
            # elif sent_type in fact_types:
            #     label = 2
            else:
                label = 0
            y.append(label)

            # Word2Vec feature
            feature1 = embeddings[i].tolist()
            # Basic features: sentence length, number of periods, percent of characters that are capitalized
            feature2 = basic_features(sentence)
            # Fraction of POS feature
            feature3 = frac_of_pos(sentence)
            # Fraction of years and numbers feature
            feature4 = frac_of_years_and_numbers(sentence)
            # Cue words feature
            feature5 = cue_words(sentence)

            features = feature1 + feature2 + feature3 + feature4 + feature5
            x.append(np.array(features))

    x = np.array(x)
    y = np.array(y)
    return x, y
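
For completeness, a hypothetical sketch of frac_of_years_and_numbers, guessed only from its name and comment (the share of tokens that look like four-digit years, and the share containing digits); the project's real helper may compute this differently:

import re

# Hypothetical sketch of frac_of_years_and_numbers, inferred from its
# comment; not the project's actual implementation.
def frac_of_years_and_numbers(sentence):
    tokens = sentence.split()
    if not tokens:
        return [0.0, 0.0]
    year_pat = re.compile(r'(19|20)\d{2}$')
    years = sum(1 for t in tokens if year_pat.match(t.strip('.,;()')))
    numbers = sum(1 for t in tokens if any(c.isdigit() for c in t))
    # Return a list so it concatenates with the other feature lists
    return [years / len(tokens), numbers / len(tokens)]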