def tag(collection, document, tagger):
    pconf = ProjectConfiguration(real_directory(collection))
    for tagger_token, _, _, tagger_service_url in pconf.get_annotator_config():
        if tagger == tagger_token:
            break
    else:
        raise UnknownTaggerError(tagger)

    doc_path = path_join(real_directory(collection), document)

    with TextAnnotations(doc_path) as ann_obj:
        url_soup = urlparse(tagger_service_url)

        if url_soup.scheme == 'http':
            Connection = HTTPConnection
        elif url_soup.scheme == 'https':
            # Delayed HTTPS import since it relies on SSL which is commonly
            # missing if you roll your own Python; for once we should not
            # fail early since tagging is currently an edge case and we
            # can't allow it to bring down the whole server.
            from httplib import HTTPSConnection
            Connection = HTTPSConnection
        else:
            raise InvalidConnectionSchemeError(tagger_token, url_soup.scheme)

        conn = None
        try:
            conn = Connection(url_soup.netloc)
            req_headers = {
                'Content-type': 'text/plain; charset=utf-8',
                'Accept': 'application/json',
            }
            # Build a new service URL since the request method doesn't accept
            # a parameters argument
            service_url = url_soup.path + (
                '?' + url_soup.query if url_soup.query else '')
            try:
                data = ann_obj.get_document_text().encode('utf-8')
                req_headers['Content-length'] = len(data)
                # Note: Trout slapping for anyone sending Unicode objects here
                conn.request(
                    'POST',
                    # As per: http://bugs.python.org/issue11898
                    # Force the url to be an ascii string
                    str(service_url),
                    data,
                    headers=req_headers)
            except SocketError, e:
                raise TaggerConnectionError(tagger_token, e)

            resp = conn.getresponse()

            # Did the request succeed?
            if resp.status != 200:
                raise TaggerConnectionError(
                    tagger_token, '%s %s' % (resp.status, resp.reason))
            # Finally, we can read the response data
            resp_data = resp.read()
        finally:
            if conn is not None:
                conn.close()
def create_comment(collection, document, id, comment=None):
    directory = collection
    undo_resp = {}

    real_dir = real_directory(directory)
    document = path_join(real_dir, document)

    projectconf = ProjectConfiguration(real_dir)

    txt_file_path = document + '.' + TEXT_FILE_SUFFIX

    # XXX what is this doing here?
    # path_split(document)[0]

    with TextAnnotations(document) as ann_obj:
        # bail as quick as possible if read-only
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        mods = ModificationTracker()

        _set_special_comments(ann_obj, id, comment, mods,
                              undo_resp=undo_resp)

        mods_json = mods.json_response()

        if undo_resp:
            mods_json['undo'] = json_dumps(undo_resp)

        mods_json['annotations'] = _json_from_ann(ann_obj)
        return mods_json
def create_arc(collection, document, origin, target, type, attributes=None,
               old_type=None, old_target=None, comment=None):
    directory = collection
    undo_resp = {}

    real_dir = real_directory(directory)

    mods = ModificationTracker()

    projectconf = ProjectConfiguration(real_dir)

    document = path_join(real_dir, document)

    with TextAnnotations(document) as ann_obj:
        # bail as quick as possible if read-only
        # TODO: make consistent across the different editing
        # functions, integrate ann_obj initialization and checks
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        origin = ann_obj.get_ann_by_id(origin)
        target = ann_obj.get_ann_by_id(target)

        # if there is a previous annotation and the arcs aren't in
        # the same category (e.g. relation vs. event arg), process
        # as delete + create instead of update.
        if old_type is not None and (
                projectconf.is_relation_type(old_type) !=
                projectconf.is_relation_type(type) or
                projectconf.is_equiv_type(old_type) !=
                projectconf.is_equiv_type(type)):
            _delete_arc_with_ann(origin.id, old_target, old_type, mods,
                                 ann_obj, projectconf)
            old_target, old_type = None, None

        if projectconf.is_equiv_type(type):
            ann = _create_equiv(ann_obj, projectconf, mods, origin, target,
                                type, attributes, old_type, old_target)
        elif projectconf.is_relation_type(type):
            ann = _create_relation(ann_obj, projectconf, mods, origin, target,
                                   type, attributes, old_type, old_target)
        else:
            ann = _create_argument(ann_obj, projectconf, mods, origin, target,
                                   type, attributes, old_type, old_target)

        # process comments
        if ann is not None:
            _set_comments(ann_obj, ann, comment, mods,
                          undo_resp=undo_resp)
        elif comment is not None:
            Messager.warning(
                'create_arc: non-empty comment for None annotation (unsupported type for comment?)')

        mods_json = mods.json_response()
        mods_json['annotations'] = _json_from_ann(ann_obj)
        return mods_json
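# Illustrative note on create_arc's delete+create branch (example values are
# made up): retyping an 'Equiv' arc between T1 and T2 into an event argument
# such as 'Theme' flips projectconf.is_equiv_type() between the old and new
# types, so the old arc is deleted via _delete_arc_with_ann and a fresh
# argument is created, rather than attempting an in-place update across
# annotation categories.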
def getAnnObject2(collection, document):
    '''Newest version of the getAnnObject method.'''
    try:
        from os.path import join as path_join
        from document import real_directory
        real_dir = real_directory(collection)
    except:
        real_dir = collection
    app_path = WORK_DIR + "/application/"
    ann = None
    full_name = collection + document
    full_name = full_name.replace("/", "")
    if isfile(app_path + full_name):
        temp = open(app_path + full_name, 'rb')
        ann = pickle_load(temp)
        temp.close()
    else:
        ann = TextAnnotations(real_dir + document)
        ann = SimpleAnnotations(ann)
        ann.folia = {}
        try:
            # TODO: good error message
            ann.folia = get_extra_info(collection, document)
        except Exception as e:
            ann.folia = {}
            Messager.error('Error: get extra folia info() failed: %s' % e)
    # Validation:
    try:
        import os
        import simplejson as json
        import session
        docdir = os.path.dirname(ann._document)
        string = session.load_conf()["config"]
        val = json.loads(string)["validationOn"]
        # validate if config enables it and if it's not already done.
        if val:
            if not ann.validated:
                from verify_annotations import verify_annotation
                projectconf = ProjectConfiguration(docdir)
                issues = verify_annotation(ann, projectconf)
            else:
                issues = ann.issues
        else:
            ann.validated = False
            issues = []
    except session.NoSessionError:
        issues = []
    except KeyError:
        issues = []
    except Exception as e:
        # TODO add an issue about the failure?
        issues = []
    ann.issues = issues
    temp = open(app_path + full_name, 'wb')
    pickle_dump(ann, temp)
    temp.close()
    return ann
def getAnnObject(collection, document):
    try:
        real_dir = real_directory(collection)
    except:
        real_dir = collection
    app_path = WORK_DIR + "/application/"
    full_name = collection + document
    full_name = full_name.replace("/", "")
    if os.path.isfile(app_path + full_name):
        temp = open(app_path + full_name, 'rb')
        ann = pickle_load(temp)
        temp.close()
    else:
        ann = TextAnnotations(real_dir + document)
        ann = SimpleAnnotations(ann)
        ann.folia = {}
        try:
            # TODO: good error message
            ann.folia = get_extra_info(collection, document)
        except Exception as e:
            ann.folia = {}
            Messager.error('Error: get extra folia info() failed: %s' % e)
    # Validation:
    try:
        docdir = os.path.dirname(ann._document)
        string = session.load_conf()["config"]
        val = json.loads(string)["validationOn"]
        # validate if config enables it and if it's not already done.
        if val:
            if not ann.validated:
                projectconf = ProjectConfiguration(docdir)
                issues = verify_annotation(ann, projectconf)
            else:
                issues = ann.issues
        else:
            ann.validated = False
            issues = []
    except session.NoSessionError:
        issues = []
    except KeyError:
        issues = []
    except Exception as e:
        # TODO add an issue about the failure?
        issues = []
        Messager.error('Error: validation failed: %s' % e)
    ann.issues = issues
    temp = open(app_path + full_name, 'wb')
    pickle_dump(ann, temp)
    temp.close()
    return ann
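# Sketch (not brat code) of the cache-key scheme the two functions above use:
# collection and document are concatenated and '/' is stripped, so the key is
# not injective and distinct (collection, document) pairs can share one pickle.
def _cache_key(collection, document):
    return (collection + document).replace("/", "")

# _cache_key("/corpus/news/", "doc1") == _cache_key("/corpus/", "newsdoc1")
# -> both yield "corpusnewsdoc1"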
def get_status(directory, document):
    with TextAnnotations(path_join(real_directory(directory), document),
                         read_only=True) as ann:

        # XXX: Assume the last one is correct if we have more
        #      than one (which is a violation of protocol anyway)
        statuses = [c for c in ann.get_statuses()]

        if statuses:
            status = statuses[-1].target
        else:
            status = None

    json_dic = {'status': status}
    return json_dic
def tag(collection, document, tagger):
    pconf = ProjectConfiguration(real_directory(collection))
    for tagger_token, _, _, tagger_service_url in pconf.get_annotator_config():
        if tagger == tagger_token:
            break
    else:
        raise UnknownTaggerError(tagger)

    doc_path = path_join(real_directory(collection), document)

    with TextAnnotations(doc_path) as ann_obj:
        url_soup = urlparse(tagger_service_url)

        if url_soup.scheme == 'http':
            Connection = HTTPConnection
        elif url_soup.scheme == 'https':
            Connection = HTTPSConnection
        else:
            raise InvalidConnectionSchemeError(tagger_token, url_soup.scheme)

        conn = None
        try:
            conn = Connection(url_soup.netloc)
            req_headers = {
                'Content-type': 'text/plain; charset=utf-8',
                'Accept': 'application/json',
            }
            # Build a new service URL since the request method doesn't accept
            # a parameters argument
            service_url = url_soup.path + (
                '?' + url_soup.query if url_soup.query else '')
            try:
                conn.request(
                    'POST',
                    service_url,
                    # The document text as body
                    ann_obj.get_document_text().encode('utf8'),
                    headers=req_headers)
            except SocketError, e:
                raise TaggerConnectionError(tagger_token, e)

            resp = conn.getresponse()

            # Did the request succeed?
            if resp.status != 200:
                raise TaggerConnectionError(
                    tagger_token, '%s %s' % (resp.status, resp.reason))
        finally:
            if conn is not None:
                conn.close()
def _document_json_dict(document):
    # TODO: DOC!

    # pointing at directory instead of document?
    if isdir(document):
        raise IsDirectoryError(document)

    j_dic = {}
    _enrich_json_with_base(j_dic)

    # TODO: We don't check if the files exist, let's be more error friendly
    # Read in the textual data to make it ready to push
    _enrich_json_with_text(j_dic, document + '.' + TEXT_FILE_SUFFIX)
    # _enrich_json_with_xml(j_dic, document + '.xml')

    # Fetch the offsets of the document's annotations
    with TextAnnotations(document) as ann_obj:
        # Note: At this stage the sentence offsets can conflict with the
        # annotations, we thus merge any sentence offsets that lie within
        # annotations
        # XXX: ~O(tb_ann * sentence_breaks), can be optimised
        # XXX: The merge strategy can lead to unforeseen consequences if two
        # sentences are not adjacent (the format allows for this:
        # S_1: [0, 10], S_2: [15, 20])
        s_breaks = j_dic['sentence_offsets']
        for tb_ann in ann_obj.get_textbounds():
            s_i = 0
            while s_i < len(s_breaks):
                s_start, s_end = s_breaks[s_i]

                # Does any subspan of the annotation stretch over the
                # end of the sentence?
                found_spanning = False
                for tb_start, tb_end in tb_ann.spans:
                    if tb_start < s_end and tb_end > s_end:
                        found_spanning = True
                        break

                if found_spanning:
                    # Merge this sentence and the next sentence
                    s_breaks[s_i] = (s_start, s_breaks[s_i + 1][1])
                    del s_breaks[s_i + 1]
                else:
                    s_i += 1

        _enrich_json_with_data(j_dic, ann_obj)

    return j_dic
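# Illustrative sketch (not brat code) of the sentence-offset merge above,
# isolated on plain offset lists so it can be tested on its own. Like the
# loop it mirrors, it assumes sentences are adjacent (see the XXX note).
def _merge_sentence_breaks(s_breaks, ann_spans):
    """Merge a sentence with its successor whenever an annotation span
    crosses the sentence end."""
    s_breaks = list(s_breaks)
    for tb_start, tb_end in ann_spans:
        s_i = 0
        while s_i < len(s_breaks):
            s_start, s_end = s_breaks[s_i]
            if tb_start < s_end and tb_end > s_end:
                # span stretches over the sentence end: merge with the next
                s_breaks[s_i] = (s_start, s_breaks[s_i + 1][1])
                del s_breaks[s_i + 1]
            else:
                s_i += 1
    return s_breaks

# _merge_sentence_breaks([(0, 10), (11, 20)], [(8, 15)]) -> [(0, 20)]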
def set_status(directory, document, status=None):
    real_dir = real_directory(directory)

    with TextAnnotations(path_join(real_dir, document)) as ann:
        # Erase all old status annotations
        for status_ann in ann.get_statuses():
            ann.del_annotation(status_ann)

        if status is not None:
            # XXX: This could work, not sure if it can induce an id collision
            new_status_id = ann.get_new_id('#')
            ann.add_annotation(
                OnelineCommentAnnotation(status, new_status_id, 'STATUS', ''))

    json_dic = {'status': status}
    return json_dic
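# Note: set_status and get_status (above) implement a tiny protocol on top of
# comment annotations: document status is stored as a '#'-id
# OnelineCommentAnnotation of type 'STATUS'. set_status first erases every
# existing status annotation, so at most one should normally remain;
# get_status nevertheless tolerates several by taking the last one.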
def _document_json_dict(document):
    # TODO: DOC!

    # pointing at directory instead of document?
    if isdir(document):
        raise IsDirectoryError(document)

    j_dic = {}
    _enrich_json_with_base(j_dic)

    # TODO: We don't check if the files exist, let's be more error friendly
    # Read in the textual data to make it ready to push
    _enrich_json_with_text(j_dic, document + '.' + TEXT_FILE_SUFFIX)

    with TextAnnotations(document) as ann_obj:
        _enrich_json_with_data(j_dic, ann_obj)

    return j_dic
def get_doc(tokens, text):
    doc = TextAnnotations(text=text)
    accu = []
    for token in tokens:
        tag = token[1]
        if tag in "BSO" and accu:
            make_annotation(doc, accu)
            accu = []
        if tag in "BESI":
            accu.append(token)
        if tag in "ES":
            make_annotation(doc, accu)
            accu = []
    if accu:
        make_annotation(doc, accu)
    return doc
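# Hypothetical usage sketch for get_doc (make_annotation is assumed to be the
# module's helper that turns the accumulated tokens into one textbound
# annotation). Tags follow the usual BIESO scheme: Begin/Inside/End/Single
# mark entity tokens, O marks tokens outside any entity.
#
#   text = "Barack Obama visited Paris"
#   tokens = [("Barack", "B"), ("Obama", "E"),
#             ("visited", "O"), ("Paris", "S")]
#   doc = get_doc(tokens, text)
#   # -> one annotation for ("Barack", "Obama") and one for ("Paris",)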
def delete_arc(collection, document, origin, target, type):
    directory = collection

    real_dir = real_directory(directory)

    mods = ModificationTracker()

    projectconf = ProjectConfiguration(real_dir)

    document = path_join(real_dir, document)

    with TextAnnotations(document) as ann_obj:
        # bail as quick as possible if read-only
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        _delete_arc_with_ann(origin, target, type, mods, ann_obj, projectconf)

        mods_json = mods.json_response()
        mods_json['annotations'] = _json_from_ann(ann_obj)
        return mods_json
def delete_span(collection, document, id):
    directory = collection

    real_dir = real_directory(directory)

    document = path_join(real_dir, document)

    txt_file_path = document + '.' + TEXT_FILE_SUFFIX

    with TextAnnotations(document) as ann_obj:
        # bail as quick as possible if read-only
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        mods = ModificationTracker()

        # TODO: Handle a failure to find it
        # XXX: Slow, O(2N)
        ann = ann_obj.get_ann_by_id(id)
        try:
            # Note: need to pass the tracker to del_annotation to track
            # recursive deletes. TODO: make usage consistent.
            ann_obj.del_annotation(ann, mods)
            try:
                trig = ann_obj.get_ann_by_id(ann.trigger)
                try:
                    ann_obj.del_annotation(trig, mods)
                except DependingAnnotationDeleteError:
                    # Someone else depended on that trigger
                    pass
            except AttributeError:
                pass
        except DependingAnnotationDeleteError, e:
            Messager.error(e.html_error_str())
            return {
                'exception': True,
            }

        mods_json = mods.json_response()
        mods_json['annotations'] = _json_from_ann(ann_obj)
        return mods_json
def convert(path, fname):
    from message import Messager
    try:
        from os.path import join as path_join
        from document import real_directory
        real_dir = real_directory(path)
    except:
        real_dir = path
    full_path = path_join(real_dir, fname)
    entity_ids = {}
    try:
        doc = folia.Document(file=full_path + ".xml")
        temp = open(full_path + ".ann", 'w')
        txt = open(full_path + ".txt", 'w')
    except IOError as e:
        Messager.error("IOError " + str(e))
        return {
            'result': False,
        }
    ann_obj = TextAnnotations(full_path)
    text, offsets = parse_text(doc)
    with SimpleAnnotations(ann_obj) as ann:
        add_entities(doc, ann, entity_ids, offsets, text)
        add_relations(doc, ann, entity_ids, offsets)
        add_attributes(doc, ann, entity_ids)
        add_comments(doc, ann, entity_ids)
        try:
            ann.folia = get_extra_info(path, fname)
        except:
            Messager.error("get_extra_info() from folia failed")
            ann.folia = {}
        txt.write(text)
        txt.close()
        #~ temp.write(str(ann))
        #~ temp.close()
        make_conf_file(real_dir, ann)
    # return is needed for the client, so it can see the function is done;
    # this can take a few seconds
    return {
        'result': True,
    }
def reverse_arc(collection, document, origin, target, type, attributes=None):
    directory = collection
    # undo_resp = {}  # TODO
    real_dir = real_directory(directory)
    # mods = ModificationTracker()  # TODO
    projectconf = ProjectConfiguration(real_dir)
    document = urllib.parse.unquote(document)
    document = path_join(real_dir, document)

    with TextAnnotations(document) as ann_obj:
        # bail as quick as possible if read-only
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        if projectconf.is_equiv_type(type):
            Messager.warning('Cannot reverse Equiv arc')
        elif not projectconf.is_relation_type(type):
            Messager.warning('Can only reverse configured binary relations')
        else:
            # OK to reverse
            found = None
            # TODO: more sensible lookup
            for ann in ann_obj.get_relations():
                if (ann.arg1 == origin and ann.arg2 == target and
                        ann.type == type):
                    found = ann
                    break

            if found is None:
                Messager.error(
                    'reverse_arc: failed to identify target relation (from %s to %s, type %s) (deleted?)' %
                    (str(origin), str(target), str(type)))
            else:
                # found it; just adjust this
                found.arg1, found.arg2 = found.arg2, found.arg1
                # TODO: modification tracker

        json_response = {}
        json_response['annotations'] = _json_from_ann(ann_obj)
        return json_response
        try:
            found = fdoc[attr.target]
            if not any(fattr.cls == str(attr.value) and
                       fattr.subset == attr.type
                       for fattr in found.select(folia.Feature)):
                print "error: not found attr"
                print attr
                print
                return False
        except KeyError:
            print "error: not found attr"
            print rel
            return False
    print "file " + path + doc + " is OK"
    return True


if __name__ == '__main__':
    #convert("/home/sander/Documenten/Masterproef/pythontest/","folia")
    convert("/home/sander/Documents/Masterproef-v2/brat/data/brat_vb/sentiment/", "sporza")
    ann = TextAnnotations("/home/sander/Documents/Masterproef-v2/brat/data/brat_vb/sentiment/sporza")
    #print ann._document_text
    #~ compare("/home/sander/Downloads/brat/data/testen/sentiment/","detijd_other_Bekaert_12-05-05")
    #~ compare("/home/sander/Downloads/brat/data/testen/sentiment/","detijd_other_bedrijfMulti_06-05-05")
    #~ compare("/home/sander/Downloads/brat/data/testen/vertaling/","1722_all_+DM")
    #~ compare("/home/sander/Downloads/brat/data/testen/sentiment/","testdoc")
    print "saved"
    #folia.validate("/home/sander/Documenten/Masterproef/pythontest/detijd_other_Bekaert_12-05-05.xml",deep="true")
def delete_arc(collection, document, origin, target, type):
    directory = collection

    real_dir = real_directory(directory)

    document = path_join(real_dir, document)

    txt_file_path = document + '.' + TEXT_FILE_SUFFIX

    with TextAnnotations(document) as ann_obj:
        # bail as quick as possible if read-only
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        mods = ModificationTracker()

        # This can be an event or an equiv
        # TODO: Check for None!
        try:
            event_ann = ann_obj.get_ann_by_id(origin)
            # Try if it is an event
            arg_tup = (type, unicode(target))
            if arg_tup in event_ann.args:
                before = unicode(event_ann)
                event_ann.args.remove(arg_tup)
                mods.change(before, event_ann)

                '''
                if not event_ann.args:
                    # It was the last argument tuple, remove it all
                    try:
                        ann_obj.del_annotation(event_ann)
                        mods.deletion(event_ann)
                    except DependingAnnotationDeleteError, e:
                        #XXX: Old message api
                        print 'Content-Type: application/json\n'
                        print dumps(e.json_error_response())
                        return
                '''
            else:
                # What we were to remove did not even exist in the first
                # place
                pass
        except AttributeError:
            projectconf = ProjectConfiguration(real_dir)
            if projectconf.is_equiv_type(type):
                # It is an equiv then?
                # XXX: Slow hack! Should have a better accessor! O(eq_ann)
                for eq_ann in ann_obj.get_equivs():
                    # We don't assume that the ids only occur in one Equiv, we
                    # keep on going since the data "could" be corrupted
                    if (unicode(origin) in eq_ann.entities
                            and unicode(target) in eq_ann.entities):
                        before = unicode(eq_ann)
                        eq_ann.entities.remove(unicode(origin))
                        eq_ann.entities.remove(unicode(target))
                        mods.change(before, eq_ann)

                        if len(eq_ann.entities) < 2:
                            # We need to delete this one
                            try:
                                ann_obj.del_annotation(eq_ann)
                                mods.deletion(eq_ann)
                            except DependingAnnotationDeleteError, e:
                                # TODO: This should never happen, dep on equiv
                                #print 'Content-Type: application/json\n'
                                # TODO: Proper exception here!
                                Messager.error(e.json_error_response())
                                return {}
            elif type in projectconf.get_relation_types():
                for ann in ann_obj.get_relations():
                    if (ann.type == type and ann.arg1 == origin
                            and ann.arg2 == target):
                        ann_obj.del_annotation(ann)
                        mods.deletion(ann)
                        break
            else:
                # unknown arc type; nothing we can delete
                pass

        mods_json = mods.json_response()
        mods_json['annotations'] = _json_from_ann(ann_obj)
        return mods_json
def _create_span(collection, document, offsets, _type, attributes=None,
                 normalizations=None, _id=None, comment=None):

    if _offset_overlaps(offsets):
        raise SpanOffsetOverlapError(offsets)

    directory = collection
    undo_resp = {}

    _attributes = _parse_attributes(attributes)
    _normalizations = _parse_span_normalizations(normalizations)

    # log_info('ATTR: %s' %(_attributes, ))

    real_dir = real_directory(directory)
    document = path_join(real_dir, document)

    projectconf = ProjectConfiguration(real_dir)

    txt_file_path = document + '.' + TEXT_FILE_SUFFIX

    path_split(document)[0]

    with TextAnnotations(document) as ann_obj:
        # bail as quick as possible if read-only
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        mods = ModificationTracker()

        if _id is not None:
            # We are to edit an existing annotation
            tb_ann, e_ann = _edit_span(ann_obj, mods, _id, offsets,
                                       projectconf, _attributes, _type,
                                       undo_resp=undo_resp)
        else:
            # We are to create a new annotation
            tb_ann, e_ann = __create_span(
                ann_obj, mods, _type, offsets, txt_file_path, projectconf,
                _attributes)

            undo_resp['action'] = 'add_tb'
            if e_ann is not None:
                undo_resp['id'] = e_ann.id
            else:
                undo_resp['id'] = tb_ann.id

        # Determine which annotation attributes, normalizations,
        # comments etc. should be attached to. If there's an event,
        # attach to that; otherwise attach to the textbound.
        if e_ann is not None:
            # Assign to the event, not the trigger
            target_ann = e_ann
        else:
            target_ann = tb_ann

        # Set attributes
        _set_attributes(ann_obj, target_ann, _attributes, mods,
                        undo_resp=undo_resp)

        # Set normalizations
        _set_normalizations(ann_obj, target_ann, _normalizations, mods,
                            undo_resp=undo_resp)

        # Set comments
        if tb_ann is not None:
            _set_comments(ann_obj, target_ann, comment, mods,
                          undo_resp=undo_resp)

        if tb_ann is not None:
            mods_json = mods.json_response()
        else:
            # Hack, probably we had a new-line in the span
            mods_json = {}
            Messager.error('Text span contained new-line, rejected',
                           duration=3)

        if undo_resp:
            mods_json['undo'] = json_dumps(undo_resp)

        mods_json['annotations'] = _json_from_ann(ann_obj)
        return mods_json
def split_span(collection, document, args, id):
    directory = collection

    real_dir = real_directory(directory)

    document = path_join(real_dir, document)

    # TODO don't know how to pass an array directly, so doing extra catenate
    # and split
    tosplit_args = json_loads(args)

    with TextAnnotations(document) as ann_obj:
        # bail as quick as possible if read-only
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        mods = ModificationTracker()

        ann = ann_obj.get_ann_by_id(id)

        # currently only allowing splits for events
        if not isinstance(ann, EventAnnotation):
            raise AnnotationSplitError(
                "Cannot split an annotation of type %s" % ann.type)

        # group event arguments into ones that will be split on and
        # ones that will not, placing the former into a dict keyed by
        # the argument without trailing numbers (e.g. "Theme1" ->
        # "Theme") and the latter in a straight list.
        split_args = {}
        nonsplit_args = []
        import re
        for arg, aid in ann.args:
            m = re.match(r'^(.*?)\d*$', arg)
            if m:
                arg = m.group(1)
            if arg in tosplit_args:
                if arg not in split_args:
                    split_args[arg] = []
                split_args[arg].append(aid)
            else:
                nonsplit_args.append((arg, aid))

        # verify that split is possible
        for a in tosplit_args:
            acount = len(split_args.get(a, []))
            if acount < 2:
                raise AnnotationSplitError(
                    "Cannot split %s on %s: only %d %s arguments (need two or more)" %
                    (ann.id, a, acount, a))

        # create all combinations of the args on which to split
        argument_combos = [[]]
        for a in tosplit_args:
            new_combos = []
            for aid in split_args[a]:
                for c in argument_combos:
                    new_combos.append(c + [(a, aid)])
            argument_combos = new_combos

        # create the new events (first combo will use the existing event)
        from copy import deepcopy
        new_events = []
        for i, arg_combo in enumerate(argument_combos):
            # tweak args
            if i == 0:
                ann.args = nonsplit_args[:] + arg_combo
            else:
                newann = deepcopy(ann)
                # TODO: avoid hard-coding ID prefix
                newann.id = ann_obj.get_new_id("E")
                newann.args = nonsplit_args[:] + arg_combo
                ann_obj.add_annotation(newann)
                new_events.append(newann)
                mods.addition(newann)

        # then, go through all the annotations referencing the original
        # event, and create appropriate copies
        for a in ann_obj:
            soft_deps, hard_deps = a.get_deps()
            refs = soft_deps | hard_deps
            if ann.id in refs:
                # Referenced; make duplicates appropriately
                if isinstance(a, EventAnnotation):
                    # go through args and make copies for referencing
                    new_args = []
                    for arg, aid in a.args:
                        if aid == ann.id:
                            for newe in new_events:
                                new_args.append((arg, newe.id))
                    a.args.extend(new_args)
                elif isinstance(a, AttributeAnnotation):
                    for newe in new_events:
                        newmod = deepcopy(a)
                        newmod.target = newe.id
                        # TODO: avoid hard-coding ID prefix
                        newmod.id = ann_obj.get_new_id("A")
                        ann_obj.add_annotation(newmod)
                        mods.addition(newmod)
                elif isinstance(a, BinaryRelationAnnotation):
                    # TODO
                    raise AnnotationSplitError(
                        "Cannot adjust annotation referencing split: not implemented for relations! (WARNING: annotations may be in inconsistent state, please reload!) (Please complain to the developers to fix this!)")
                elif isinstance(a, OnelineCommentAnnotation):
                    for newe in new_events:
                        newcomm = deepcopy(a)
                        newcomm.target = newe.id
                        # TODO: avoid hard-coding ID prefix
                        newcomm.id = ann_obj.get_new_id("#")
                        ann_obj.add_annotation(newcomm)
                        mods.addition(newcomm)
                elif isinstance(a, NormalizationAnnotation):
                    for newe in new_events:
                        newnorm = deepcopy(a)
                        newnorm.target = newe.id
                        # TODO: avoid hard-coding ID prefix
                        newnorm.id = ann_obj.get_new_id("N")
                        ann_obj.add_annotation(newnorm)
                        mods.addition(newnorm)
                else:
                    raise AnnotationSplitError(
                        "Cannot adjust annotation referencing split: not implemented for %s! (Please complain to the lazy developers to fix this!)" %
                        a.__class__)

        mods_json = mods.json_response()
        mods_json['annotations'] = _json_from_ann(ann_obj)
        return mods_json
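# Illustrative sketch (not brat code): the argument_combos construction in
# split_span above is a cartesian product over the to-split argument slots;
# itertools.product expresses the same thing directly.
from itertools import product

def _split_combos(split_args, tosplit_args):
    """split_args maps a base role (e.g. 'Theme') to its argument ids;
    returns one [(role, id), ...] combination per resulting event."""
    return [list(zip(tosplit_args, combo))
            for combo in product(*(split_args[a] for a in tosplit_args))]

# _split_combos({'Theme': ['T1', 'T2']}, ['Theme'])
# -> [[('Theme', 'T1')], [('Theme', 'T2')]]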
def tag(collection, document, tagger):
    pconf = ProjectConfiguration(real_directory(collection))
    print("tagger", tagger, file=sys.stderr)

    for tagger_token, _, _, tagger_service_url in pconf.get_annotator_config():
        if tagger == tagger_token:
            break
    else:
        raise UnknownTaggerError(tagger)

    doc_path = path_join(real_directory(collection), document)
    # print("path_join(real_directory(collection), document)", doc_path, file=sys.stderr)
    # print("tagger_token", tagger_token, file=sys.stderr)

    with TextAnnotations(doc_path) as ann_obj:
        # print("ann_obj", document, file=sys.stderr)

        url_soup = urlparse(tagger_service_url)

        if url_soup.scheme == 'http':
            Connection = HTTPConnection
            # print("HTTPConnection", HTTPConnection, file=sys.stderr)
        elif url_soup.scheme == 'https':
            # Delayed HTTPS import since it relies on SSL which is commonly
            # missing if you roll your own Python; for once we should not
            # fail early since tagging is currently an edge case and we
            # can't allow it to bring down the whole server.
            from http.client import HTTPSConnection
            Connection = HTTPSConnection
        else:
            raise InvalidConnectionSchemeError(tagger_token, url_soup.scheme)

        conn = None
        try:
            conn = Connection(url_soup.netloc)
            req_headers = {
                'Content-type': 'text/plain; charset=utf-8',
                'Accept': 'application/json',
            }
            # Build a new service URL since the request method doesn't accept
            # a parameters argument
            service_url = url_soup.path + (
                '?' + url_soup.query if url_soup.query else '')
            try:
                # Note: Trout slapping for anyone sending Unicode objects here
                data = str(doc_path) + "#*^$#" + ann_obj.get_document_text()
                data = data.encode('utf-8')
                # print("data", type(data), data, file=sys.stderr)
                # print("data", ann_obj, file=sys.stderr)
                req_headers['Content-length'] = len(data)
                conn.request(
                    'POST',
                    # As per: http://bugs.python.org/issue11898
                    # Force the url to be an ascii string
                    str(service_url),
                    data,
                    headers=req_headers)
                # httpConnection = http.client.HTTPConnection(url_soup.netloc)
                # httpConnection.request('GET', str(service_url), headers=req_headers)
                # response = httpConnection.getresponse()
            except SocketError as e:
                raise TaggerConnectionError(tagger_token, e)

            resp = conn.getresponse()
            # print("resp-------------", resp.read(), file=sys.stderr)

            # Did the request succeed?
            if resp.status != 200:
                raise TaggerConnectionError(
                    tagger_token, '%s %s' % (resp.status, resp.reason))
            # Finally, we can read the response data
            resp_data = resp.read()
        finally:
            if conn is not None:
                conn.close()

        try:
            json_resp = loads(resp_data)
            # print("json_resp", json_resp, file=sys.stderr)
        except ValueError:
            raise InvalidTaggerResponseError(tagger_token, resp_data)

        mods = ModificationTracker()
        cidmap = {}

        # print("json_resp.items:::::::::::::", json_resp.items(), file=sys.stderr)
        for cid, ann in ((i, a) for i, a in json_resp.items()
                         if _is_textbound(a)):
            assert 'offsets' in ann, 'Tagger response lacks offsets'
            offsets = ann['offsets']
            # print("json_resp.items:::::::::::::", offsets, file=sys.stderr)
            assert 'type' in ann, 'Tagger response lacks type'
            _type = ann['type']
            assert 'texts' in ann, 'Tagger response lacks texts'
            texts = ann['texts']

            # sanity
            assert len(offsets) != 0, 'Tagger response has empty offsets'
            assert len(texts) == len(offsets), \
                'Tagger response has different numbers of offsets and texts'

            start, end = offsets[0]
            text = texts[0]
            # print("offsets, _type, texts, text:", offsets, _type, texts, text, file=sys.stderr)

            _id = ann_obj.get_new_id('T')
            print("_id", _id, file=sys.stderr)
            cidmap[cid] = _id

            tb = TextBoundAnnotationWithText(offsets, _id, _type, text,
                                             " " + ' '.join(texts[1:]))

            mods.addition(tb)
            ann_obj.add_annotation(tb)

        for norm in (a for a in json_resp.values() if _is_normalization(a)):
            try:
                _type = norm['type']
                target = norm['target']
                refdb = norm['refdb']
                refid = norm['refid']
            except KeyError as e:
                raise  # TODO

            _id = ann_obj.get_new_id('N')
            target = cidmap[target]

            na = NormalizationAnnotation(_id, _type, target, refdb, refid, '')

            mods.addition(na)
            ann_obj.add_annotation(na)

        mod_resp = mods.json_response()
        mod_resp['annotations'] = _json_from_ann(ann_obj)
        return mod_resp
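# For reference, a minimal tagger response accepted by the loops above (the
# shape is inferred from the assertions; ids and values here are made up):
#
#   {
#       "T1": {"offsets": [[0, 6]], "type": "Person", "texts": ["Barack"]},
#       "N1": {"type": "Reference", "target": "T1",
#              "refdb": "Wikipedia", "refid": "534366"}
#   }
#
# Textbound entries must carry equally long 'offsets' and 'texts' lists;
# normalization entries point at a textbound via 'target', which is remapped
# through cidmap to the id the server actually assigned.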