import os
import pickle

import numpy as np

# Project helpers such as word2vec, finding_by_heuristics, basic_features,
# frac_of_pos, frac_of_years_and_numbers, cue_words, and read_files are
# assumed to be defined or imported elsewhere in this module.


def annotate_sentences(model, sents):
    cared_sents = []
    embeddings = word2vec(sents)  # If using USC
    for i in range(len(sents)):
        sentence = sents[i]
        if finding_by_heuristics(sentence):
            cared_sents.append(sentence)
            continue
        # Word2Vec feature
        feature1 = embeddings[i].tolist()  # If using USC
        # feature1 = word2vec(sentence)  # If using law2vec
        # Basic features: sentence length, number of periods,
        # percent of characters that are capitalized
        feature2 = basic_features(sentence)
        # Fraction of POS feature
        feature3 = frac_of_pos(sentence)
        # Fraction of years and numbers feature
        feature4 = frac_of_years_and_numbers(sentence)
        # Cue words feature
        feature5 = cue_words(sentence)
        features = np.array(
            feature1 + feature2 + feature3 + feature4 + feature5
        ).reshape(1, -1)
        pred = model.predict(features)  # renamed from `type` to avoid shadowing the builtin
        if pred[0] == 1:
            cared_sents.append(sentence)
    return cared_sents
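# Usage sketch (illustrative, not part of the original pipeline): `model` can be
# any trained scikit-learn-style classifier exposing .predict(), e.g. one fit on
# the output of combine_features() below. The joblib file name and the sample
# sentences here are hypothetical.
def demo_annotate_sentences():
    import joblib  # assumes the trained classifier was persisted with joblib

    clf = joblib.load('finding_classifier.joblib')  # hypothetical path
    sample_sents = [
        'The veteran testified that his hearing loss began in service.',
        'See 38 C.F.R. 3.303 (2016).',
    ]
    # Returns only the sentences that the heuristics or the classifier
    # flag as findings of fact.
    return annotate_sentences(clf, sample_sents)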
def combine_features(cases_names):
    # Full annotation scheme of the corpus; only 'Evidence' and
    # 'EvidenceBasedFinding' sentences are used below.
    types = [
        'Citation', 'LegalRule', 'LegalPolicy', 'PolicyBasedReasoning',
        'ConclusionOfLaw', 'EvidenceBasedFinding', 'EvidenceBasedReasoning',
        'Evidence', 'Procedure', 'Header'
    ]
    type_set = set(types)
    x = []
    y = []
    for case_name in cases_names:
        annotated_sentences = read_files([case_name])
        # case_name[:-4] drops the file extension (e.g. '.txt')
        pickle_path = '../annotated_casetext/' + case_name[:-4] + '.pickle'
        if not os.path.exists(pickle_path):
            pure_sentences = [
                annotated_sentences[i][0]
                for i in range(len(annotated_sentences))
            ]
            embeddings = word2vec(pure_sentences)
            # Save embeddings to speed up later runs
            with open(pickle_path, 'wb') as handle:
                pickle.dump(embeddings, handle, protocol=2)
        else:
            with open(pickle_path, 'rb') as handle:
                embeddings = pickle.load(handle)
        for i in range(len(annotated_sentences)):
            sentence, sent_type = annotated_sentences[i]
            # Binary label: 1 for findings of fact, 0 for plain evidence;
            # every other sentence type is skipped.
            label = 0
            if sent_type == 'EvidenceBasedFinding':
                label = 1
            elif sent_type != 'Evidence':
                continue
            y.append(label)
            # Word2Vec feature
            feature1 = embeddings[i].tolist()
            # Basic features: sentence length, number of periods,
            # percent of characters that are capitalized
            feature2 = basic_features(sentence)
            # Fraction of POS feature
            feature3 = frac_of_pos(sentence)
            # Fraction of years and numbers feature
            feature4 = frac_of_years_and_numbers(sentence)
            # Cue words feature
            feature5 = cue_words(sentence)
            features = feature1 + feature2 + feature3 + feature4 + feature5
            x.append(np.array(features))
    x = np.array(x)
    y = np.array(y)
    return x, y
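# Minimal sketch of the basic_features helper referenced above, assuming it
# returns exactly the three quantities its call-site comment names: sentence
# length, number of periods, and the fraction of capitalized characters. The
# real helper may normalize or scale these differently, so this is named
# basic_features_sketch to avoid shadowing the project's implementation.
def basic_features_sketch(sentence):
    length = len(sentence)
    num_periods = sentence.count('.')
    num_upper = sum(1 for ch in sentence if ch.isupper())
    frac_upper = num_upper / length if length else 0.0
    return [length, num_periods, frac_upper]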
# Variant of combine_features: reads from a configurable annotated_dir and
# labels both finding and fact sentence types as positive. `annotated_dir`,
# `finding_types`, and `fact_types` are assumed to be module-level settings.
def combine_features(cases_names):
    x = []
    y = []
    for case_name in cases_names:
        annotated_sentences = read_files([case_name], annotated_dir)
        pickle_path = annotated_dir + case_name[:-4] + '.pickle'
        if not os.path.exists(pickle_path):
            pure_sentences = [
                annotated_sentences[i][0]
                for i in range(len(annotated_sentences))
            ]
            try:
                embeddings = word2vec(pure_sentences)
            except Exception:  # narrowed from a bare except; skip cases the encoder rejects
                print("USC failed", case_name)
                continue
            # Save embeddings to speed up later runs
            with open(pickle_path, 'wb') as handle:
                pickle.dump(embeddings, handle, protocol=2)
        else:
            with open(pickle_path, 'rb') as handle:
                embeddings = pickle.load(handle)
        for i in range(len(annotated_sentences)):
            sentence, sent_type = annotated_sentences[i]
            # Positive label for both findings and facts; everything else is 0.
            if sent_type in finding_types or sent_type in fact_types:
                label = 1
            # elif sent_type in fact_types:
            #     label = 2
            else:
                label = 0
            y.append(label)
            # Word2Vec feature
            feature1 = embeddings[i].tolist()
            # Basic features: sentence length, number of periods,
            # percent of characters that are capitalized
            feature2 = basic_features(sentence)
            # Fraction of POS feature
            feature3 = frac_of_pos(sentence)
            # Fraction of years and numbers feature
            feature4 = frac_of_years_and_numbers(sentence)
            # Cue words feature
            feature5 = cue_words(sentence)
            features = feature1 + feature2 + feature3 + feature4 + feature5
            x.append(np.array(features))
    x = np.array(x)
    y = np.array(y)
    return x, y
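# Training sketch (assumptions flagged): the code above does not show which
# classifier is trained on the combine_features() output, so a scikit-learn
# LogisticRegression stands in here; `train_cases` is a hypothetical list of
# annotated case file names. The fitted model can then be passed to
# annotate_sentences() above.
def train_finding_classifier(train_cases):
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import classification_report
    from sklearn.model_selection import train_test_split

    x, y = combine_features(train_cases)
    # Hold out 20% for a quick sanity check, stratified by label.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=42, stratify=y)
    clf = LogisticRegression(max_iter=1000)
    clf.fit(x_train, y_train)
    print(classification_report(y_test, clf.predict(x_test)))
    return clf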