forked from ivankoval/ReadAbility
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ent_features_eng.py
56 lines (42 loc) · 1.69 KB
/
ent_features_eng.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
from __future__ import division
import nltk
import collections
import numpy as np
import time
from common_functions import get_words, total_sentences, prepare_dataset
from pymongo import MongoClient
def extract_entities(text):
    """Count named-entity tokens found in *text*.

    The text is split into sentences, each sentence is tokenized and
    POS-tagged, and the tagged tokens are chunked with ``binary=True``.
    Every leaf of every subtree labelled ``'NE'`` is collected, so each
    token of a multi-word entity is counted on its own.

    Args:
        text: Raw text to analyse.

    Returns:
        An ``Entities`` namedtuple ``(ne, unique_ne)``: the total number of
        collected entity tokens and the number of distinct ones.
    """
    ne_tokens = []
    for sentence in nltk.sent_tokenize(text):
        tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
        for node in nltk.chunk.ne_chunk(tagged, binary=True):
            # Plain (word, tag) tuples are tokens outside any chunk; only
            # subtrees can carry the 'NE' label.
            if not (isinstance(node, nltk.tree.Tree) and node.label() == 'NE'):
                continue
            # Each leaf is indexable; its first element is the token text.
            ne_tokens.extend(leaf[0] for leaf in node.leaves())

    result_type = collections.namedtuple('Entities', ['ne', 'unique_ne'])
    total = len(ne_tokens)
    distinct = len(np.unique(ne_tokens))
    return result_type(total, distinct)
def extract_features(data):
    """Compute named-entity density features for one text.

    Args:
        data: Raw text of the document.

    Returns:
        A two-element list ``[feature1, feature2]`` where ``feature1`` is
        the named-entity token count divided by the word count, times 100,
        and ``feature2`` is the named-entity token count divided by the
        sentence count, times 100. A feature whose denominator is zero
        (e.g. empty text) is reported as ``0.0`` instead of raising
        ``ZeroDivisionError``.

    Side effects:
        Prints both features, separated by a space, to stdout.
    """
    ne = extract_entities(data).ne
    tw = len(get_words(data))
    ts = total_sentences(data)

    # True division is relied upon here (the module enables it via
    # ``from __future__ import division``). Guard the denominators so a
    # single empty document does not abort a whole extraction run.
    feature1 = ne / tw * 100 if tw else 0.0
    feature2 = ne / ts * 100 if ts else 0.0

    # Single parenthesised argument: prints identically under the Python 2
    # print statement and the Python 3 print function.
    print(str(feature1) + " " + str(feature2))
    return [feature1, feature2]
def get_test_data(
        path_to_data="/Users/Ivan/PycharmProject/ReadAbility/DataSets_raw/eng/byGrade/",
        grades=None,
        mongo_uri='mongodb://localhost:27017/'):
    """Extract entity features for a graded dataset and store them in MongoDB.

    The dataset is loaded with ``prepare_dataset``; the ``ent-eng``
    collection of the ``features`` database is dropped and then refilled
    with one document per text of the form
    ``{"grade": <text.grade>, "features": <extract_features(text.data)>}``.

    Args:
        path_to_data: Directory passed to ``prepare_dataset``. Defaults to
            the original hard-coded location.
        grades: List of grade labels passed to ``prepare_dataset``.
            Defaults to ``['2-3', '6-8', '11-CCR']``.
        mongo_uri: Connection URI handed to ``MongoClient``.

    Returns:
        None.
    """
    if grades is None:
        # Alternative subsets previously used here:
        #   ['K-1', '4-5', '9-10']
        #   ['K-1', '2-3', '4-5', '6-8', '9-10', '11-CCR']
        grades = ['2-3', '6-8', '11-CCR']

    dataset = prepare_dataset(path_to_data, grades)

    client = MongoClient(mongo_uri)
    try:
        features_collection = client.features['ent-eng']
        # Start from an empty collection so repeated runs do not accumulate
        # duplicate feature documents.
        features_collection.drop()
        for text in dataset:
            text_features = {"grade": text.grade,
                             "features": extract_features(text.data)}
            features_collection.insert_one(text_features)
    finally:
        # Release the connection even if feature extraction or an insert
        # fails part-way through.
        client.close()
# Run the full extraction immediately and report the elapsed wall-clock
# time in seconds.
start = time.time()
get_test_data()
# Single parenthesised argument: same output under the Python 2 print
# statement and the Python 3 print function.
print(str(time.time() - start) + " sec")