-
Notifications
You must be signed in to change notification settings - Fork 0
/
docimport.py
executable file
·149 lines (111 loc) · 3.96 KB
/
docimport.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/usr/bin/env python
from __future__ import with_statement
from itertools import izip_longest
'''
Simple interface for importing files into the data directory.
Author: Pontus Stenetorp <pontus is s u-tokyo ac jp>
Version: 2011-02-21
'''
from annotation import open_textfile
from common import ProtocolError
#from config import DATA_DIR
from document import real_directory
from annotation import JOINED_ANN_FILE_SUFF, TEXT_FILE_SUFFIX
from os.path import join as join_path
from os.path import isdir, isfile
from os import access, W_OK
from lemmatizer import lemmatizer
from lemmatizer2 import lemmatizer2
from freeling2conll import conll
from conll2standoff import standoff_main
### Constants
DEFAULT_IMPORT_DIR = 'import'
###
class InvalidDirError(ProtocolError):
    '''
    Protocol error signalling that an import target directory was rejected.

    The offending path is stored on the instance as `path`.
    '''

    def __init__(self, path):
        # Remember which path triggered the error.
        self.path = path

    def __str__(self):
        # Fixed message; the stored path is not included in the text.
        message = 'Invalid directory'
        return message

    def json(self, json_dic):
        '''
        Record this error's identifier under the 'exception' key of
        json_dic and return the same object.
        '''
        error_name = 'invalidDirError'
        json_dic['exception'] = error_name
        return json_dic
class FileExistsError(ProtocolError):
    '''
    Protocol error signalling that a file to be created is already present.

    The path of the existing file is stored on the instance as `path`.
    '''

    def __init__(self, path):
        # Remember which file was found to exist.
        self.path = path

    def __str__(self):
        # Human-readable message naming the existing file.
        template = 'File exists: %s'
        return template % self.path

    def json(self, json_dic):
        '''
        Record this error's identifier under the 'exception' key of
        json_dic and return the same object.
        '''
        error_name = 'fileExistsError'
        json_dic['exception'] = error_name
        return json_dic
class NoWritePermissionError(ProtocolError):
    '''
    Protocol error signalling that a path is not writable.

    The path that failed the check is stored on the instance as `path`.
    '''

    def __init__(self, path):
        # Remember which path could not be written to.
        self.path = path

    def __str__(self):
        # Human-readable message naming the unwritable path.
        template = 'No write permission to %s'
        return template % self.path

    def json(self, json_dic):
        '''
        Record this error's identifier under the 'exception' key of
        json_dic and return the same object.
        '''
        error_name = 'noWritePermissionError'
        json_dic['exception'] = error_name
        return json_dic
def _resolve_import_dir(collection):
    '''
    Return the validated directory path that a document is imported into.

    Arguments:
    collection -- collection (directory) name as passed to save_import,
                  or None to use DATA_DIR from the config module

    Raises:
    InvalidDirError -- if collection contains a parent-directory reference
                       or the resolved path is not a directory
    NoWritePermissionError -- if the resolved directory is not writable
    '''
    if collection is None:
        # The module-level `from config import DATA_DIR` is commented out,
        # which made this branch fail with a NameError. Import it here so
        # the default works without touching the module's import block.
        # NOTE(review): assumes config still defines DATA_DIR -- confirm.
        from config import DATA_DIR
        dir_path = DATA_DIR
    else:
        #XXX: These "security" measures can surely be fooled
        # Reject everything the old substring test rejected ('../' anywhere)
        # and additionally any path component that is exactly '..', which
        # also covers a bare '..' and a trailing 'foo/..'.
        has_parent_ref = ('../' in collection
                          or '..' in collection.split('/'))
        if has_parent_ref:
            raise InvalidDirError(collection)
        dir_path = real_directory(collection)

    # Is the directory a directory and are we allowed to write?
    if not isdir(dir_path):
        raise InvalidDirError(dir_path)
    if not access(dir_path, W_OK):
        raise NoWritePermissionError(dir_path)
    return dir_path


def save_import(text, docid, collection=None):
    '''
    Import a new document from raw text.

    The target directory is validated, the text and annotation paths for
    docid are checked to make sure nothing would be overwritten, and the
    text is then passed through lemmatizer and lemmatizer2. The two results
    are paired element by element (izip_longest pads the shorter one with
    None), converted by conll, and handed to standoff_main together with
    docid.

    NOTE(review): this function only checks the text/annotation paths; it
    does not write them itself. Presumably standoff_main produces the
    files -- confirm.

    Arguments:
    text -- document text; a trailing newline is appended if missing
    docid -- document identifier, used as the file base name
    collection -- directory to import into, or None for DATA_DIR

    Returns:
    A dictionary of the form { 'document': docid }.

    Raises:
    InvalidDirError -- if the target directory is rejected or missing
    NoWritePermissionError -- if the target directory is not writable
    FileExistsError -- if the text or annotation file already exists
    '''
    dir_path = _resolve_import_dir(collection)

    base_path = join_path(dir_path, docid)
    txt_path = base_path + '.' + TEXT_FILE_SUFFIX
    ann_path = base_path + '.' + JOINED_ANN_FILE_SUFF

    # Before we proceed, verify that we are not overwriting
    for path in (txt_path, ann_path):
        if isfile(path):
            raise FileExistsError(path)

    # Make sure we have a valid POSIX text file, i.e. that the
    # file ends in a newline.
    if text and not text.endswith('\n'):
        text = text + '\n'

    # Run both lemmatizers over the same text and pair up their outputs.
    lemmatized_text1 = lemmatizer(text)
    real_lemmatized_text = lemmatizer2(text)
    lemmatized_text = list(izip_longest(lemmatized_text1,
                                        real_lemmatized_text))

    conll_text = conll(lemmatized_text)
    standoff_main(conll_text, docid)

    return { 'document': docid }
if __name__ == '__main__':
    # TODO: Update these to conform with the new API
    # NOTE(review): the unit tests below are disabled by being wrapped in a
    # bare string literal, so running this module as a script currently does
    # nothing. They call save_import() with `relative_dir` and `directory`
    # keyword arguments, which the save_import(text, docid, collection=None)
    # signature in this file does not accept.
    '''
from unittest import TestCase
from tempfile import mkdtemp
from shutil import rmtree
from os import mkdir
class SaveImportTest(TestCase):
test_text = 'This is not a drill, this is a drill *BRRR!*'
test_dir = 'test'
test_filename = 'test'
def setUp(self):
self.tmpdir = mkdtemp()
mkdir(join_path(self.tmpdir, SaveImportTest.test_dir))
mkdir(join_path(self.tmpdir, DEFAULT_IMPORT_DIR))
def tearDown(self):
rmtree(self.tmpdir)
def test_import(self):
save_import(SaveImportTest.test_text, SaveImportTest.test_filename,
relative_dir=SaveImportTest.test_dir,
directory=self.tmpdir)
def test_default_import_dir(self):
save_import(SaveImportTest.test_text, SaveImportTest.test_filename,
directory=self.tmpdir)
import unittest
unittest.main()
'''