/
annotate.py
147 lines (112 loc) · 3.55 KB
/
annotate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import rdt.data.clean.html as clean
import rdt.data.mongo.source as rdtcorp
import rdt.nlp.ngrams as ngrams
import rdt.nlp.pos as pos
from nltk.chunk import ne_chunk
from nltk.chunk.util import tree2conlltags
import rdt.nlp.conll_get as cnll
import rdt.data.clean.html as html
import json, nltk
def dirty_dict(doc, tagger=None):
    """ Clean a single raw dictionary and annotate it.

        :param doc: A dictionary without cleansed_text added
        :type doc: dict
        :param tagger: A pos tagger.
        :type tagger: Tagger
        :returns: dict
    """
    cleaned = html.clean_doc(doc)
    return clean_dict(cleaned, tagger=tagger)
def dirty_dicts(docs, tagger=None):
    """ Clean and annotate many raw dictionaries, lazily.

        :param docs: Dictionaries without cleansed_text.
        :type docs: [{docs}]
        :param tagger: A pos tagger.
        :type tagger: Tagger
        :returns: generator(dict)
    """
    for raw in docs:
        annotated = dirty_dict(raw, tagger=tagger)
        yield annotated
def clean_dict(doc, tagger=nltk.pos_tag):
    """ Processes NLP features from cleansed_text. All other functions
        wrap this one.

        Serves to act as the NLP-front end for reddit corpus
        parsing. Dictionaries and json strings are accepted and return
        dictionaries containing additional information. The processing
        done here represents the general annotations. The following
        are the new fields added to the dictionary. Classifiers
        will work to modify or wrap these methods.
        ::

            {
                conlltags : [[(word, pos, BIO)]],
                nouns : [word],
                named_entities : [[word, pos, BIO]],
                cleansed_text : [[word]]
            }

        :param doc: dictionary of reddit corpus.
        :type doc: dict
        :param tagger: A pos tagger; falls back to nltk.pos_tag when None
                       (wrappers in this module pass tagger=None).
        :type tagger: Tagger
        :returns: dict
    """
    # Wrappers pass tagger=None explicitly; normalize to the default tagger.
    if tagger is None:
        tagger = nltk.pos_tag
    # Drop Mongo's ObjectId if present -- it is not JSON-serializable.
    doc.pop("_id", None)
    sentences = pos.tokenize_sents(doc["cleansed_text"])
    tags = pos.tokenize_words(sentences) or []
    doc["conlltags"] = []
    doc["nouns"] = []
    doc["named_entities"] = []
    for sent in tags:
        # Bug fix: honor the caller-supplied tagger; the original
        # hardcoded nltk.pos_tag here, silently ignoring the parameter.
        tagged_sent = tagger(sent) or []
        tree = ne_chunk(tagged_sent) or []
        chunks = tree2conlltags(tree)
        doc["conlltags"].append(chunks)
        doc["nouns"].extend(cnll.get_nouns(chunks))
        doc["named_entities"].extend(cnll.get_ne(chunks))
    return doc
def clean_dicts(docs, tagger=None):
    """ Returns the annotated versions of the documents.

        :param docs: Dictionaries to be annotated.
        :type docs: [{}]
        :param tagger: A pos tagger.
        :type tagger: Tagger
        :returns: iter([dict])
    """
    # Bug fix: the original iterated ``enumerate(docs)``, which yields
    # (index, doc) tuples and handed the tuple to clean_dict, raising a
    # TypeError on the first access of doc["_id"] / doc["cleansed_text"].
    for doc in docs:
        yield clean_dict(doc, tagger=tagger)
def dirty_json(doc, tagger=None):
    """ Parse a raw json document, then clean and annotate it.

        :param doc: json string without cleansed text or annotations
        :type doc: str
        :param tagger: A pos tagger.
        :type tagger: Tagger
        :returns: dict
    """
    parsed = json.loads(doc)
    cleaned = html.clean_doc(parsed)
    return clean_dict(cleaned, tagger=tagger)
def dirty_jsons(docs, tagger=None):
    """ Clean and annotate many raw json documents, lazily.

        :param docs: List of unannotated json strings
        :type docs: ["{json_string}"]
        :param tagger: A pos tagger.
        :type tagger: Tagger
        :returns: generator(dict)
    """
    for raw in docs:
        annotated = dirty_json(raw, tagger=tagger)
        yield annotated
def clean_json(doc, tagger=None):
    """ Annotate a json document that already has cleansed_text.

        :param doc: A json string with a cleansed_text field
        :type doc: str
        :param tagger: A pos tagger.
        :type tagger: Tagger
        :returns: dict
    """
    parsed = json.loads(doc)
    return clean_dict(parsed, tagger=tagger)
def clean_jsons(docs, tagger=None):
    """ Annotate many json documents, lazily.

        :param docs: list of json documents containing cleansed_text fields
        :type docs: [str]
        :param tagger: A pos tagger.
        :type tagger: Tagger
        :returns: iter([dict])
    """
    for raw in docs:
        annotated = clean_json(raw, tagger=tagger)
        yield annotated