def _extract_text(pdf_file_path):
    """Return the raw text content of the document at *pdf_file_path*.

    The extraction method is chosen from the file-name suffix, compared
    case-insensitively: ``pdf`` goes through textract's pdfminer backend,
    ``html``/``htm`` through its beautifulsoup4 backend, and ``txt`` files
    are read directly.

    Returns ``None`` when the suffix matches none of the supported types.
    """
    lowered = pdf_file_path.lower()
    if lowered.endswith('pdf'):
        return textract.process(pdf_file_path, method="pdfminer")
    if lowered.endswith(('html', 'htm')):
        return textract.process(pdf_file_path, method="beautifulsoup4")
    if lowered.endswith('txt'):
        with open(pdf_file_path, 'r') as handle:
            return handle.read()
    return None


def _log_annotation_failure(db, pdf_file_path, error, exception=''):
    """Insert one failure record for *pdf_file_path* through ``db.insert_log``."""
    db.insert_log({
        'file_name': pdf_file_path.encode('utf-8'),
        'error': error,
        'exception': exception,
        'data': ''
    })


def annotate_doc(pdf_file_path, ontologies):
    """Annotate a document with the BioPortal annotator and store the results.

    The document text is extracted, transliterated with ``unidecode``,
    whitespace-normalised, and posted to ``settings.ANNOTATOR_URL``. Each
    annotation in the response becomes one record (class id, ontology link,
    matched text, match type and a window of surrounding text) and all
    records are saved with ``db.insert_annotations``.

    Failures are not raised to the caller for the cases below; they are
    written with ``db.insert_log`` and the function returns early:

    * the file suffix is not one of pdf / html / htm / txt,
    * the extracted text is empty or whitespace only,
    * the annotator response is not valid JSON or lacks the expected shape.

    Args:
        pdf_file_path: Path of the document to annotate.
        ontologies: Iterable of ontology identifier strings; they are sent
            to the annotator as a comma-separated list.

    Returns:
        None.
    """
    text = _extract_text(pdf_file_path)
    db = DBConnect()

    if text is None:
        # Previously this case crashed with an unbound ``text`` variable.
        _log_annotation_failure(
            db, pdf_file_path,
            'Unsupported file type in annotation process')
        return

    # ``isspace()`` is False for an empty string, so test the stripped text
    # to catch both "nothing extracted" and "only whitespace extracted".
    if not text.strip():
        _log_annotation_failure(
            db, pdf_file_path,
            'Failed PDF to text transformation in annotation process')
        return

    # Decode once at the boundary; everything below works on text.
    if isinstance(text, bytes):
        text = text.decode('utf8')
    text = unidecode(text)
    text = ' '.join(text.split())

    ontology_list = ",".join(ontologies)
    post_data = dict(
        apikey=settings.BIOPORTAL_API_KEY,
        text=text,
        display_links='true',
        display_context='false',
        minimum_match_length='3',
        exclude_numbers='true',
        longest_only='true',
        ontologies=ontology_list,
        exclude_synonyms='true',
    )

    annotations = []
    try:
        response = requests.post(settings.ANNOTATOR_URL, post_data)
        json_results = json.loads(response.text)
        for result in json_results:
            for annotation in result['annotations']:
                # Up to 40 characters either side of the match, clamped to
                # the text bounds so matches near the start keep their
                # leading context.
                # NOTE(review): offsets are used directly as slice indices;
                # confirm whether the annotator reports 0- or 1-based
                # positions.
                context_begin = max(annotation['from'] - 40, 0)
                context_end = min(annotation['to'] + 40, len(text))
                annotations.append({
                    'file_name': pdf_file_path.encode('utf-8'),
                    'bio_class_id': result['annotatedClass']['@id'],
                    'bio_ontology_id':
                        result['annotatedClass']['links']['ontology'],
                    # Keep the matched text as unicode; encoding it first
                    # and then concatenating with u'' forced an implicit
                    # ASCII decode that failed on non-ASCII matches.
                    'text': u'' + annotation['text'],
                    'match_type': annotation['matchType'],
                    'context': u'' + text[context_begin:context_end]
                })
        db.insert_annotations(annotations)
        return
    except (ValueError, IndexError, KeyError, TypeError) as e:
        # TypeError covers a JSON object (rather than a list) response,
        # where iterating yields string keys that cannot be indexed by name.
        print(e)
        _log_annotation_failure(
            db, pdf_file_path,
            'Bad response from Bioportal Annotator', str(e))
        return