def word_count_dir(dir_path):
    """Count word frequencies across all documents in a directory and store matches for known tags."""
    file_names = []
    db = DBConnect()
    dictionary = db.select_tags()
    # Gather all PDF, HTML and text files under the directory.
    for root, dirs, files in walk(dir_path):
        for name in files:
            if 'pdf' in name or 'htm' in name or 'txt' in name:
                file_names.append(path.join(root, name))
    click.secho("Counting words for {} documents.".format(len(file_names)), fg='blue')
    n_jobs = 20
    # Count words per file in parallel, then merge the per-file counters.
    dir_counters = Parallel(n_jobs=n_jobs)(delayed(word_count)(file_name) for file_name in file_names)
    total = sum(dir_counters, Counter())
    # Persist the frequency of every dictionary word that appears in the corpus.
    for word in dictionary:
        if word in total:
            db.insert_word({'word': word, 'frequency': total[word]})
    return total

def annotate_doc(pdf_file_path, ontologies):
    """Annotate a single document with the BioPortal Annotator and store the annotations."""
    # Extract plain text from the document based on its extension.
    if pdf_file_path.endswith('pdf') or pdf_file_path.endswith('PDF'):
        text = textract.process(pdf_file_path, method="pdfminer")
    elif pdf_file_path.endswith('html') or pdf_file_path.endswith('htm'):
        text = textract.process(pdf_file_path, method="beautifulsoup4")
    elif pdf_file_path.endswith('txt'):
        with open(pdf_file_path, 'r') as file:
            text = file.read()
    db = DBConnect()
    if text.isspace():
        log = {
            'file_name': pdf_file_path.encode('utf-8'),
            'error': 'Failed PDF to text transformation in annotation process',
            'exception': '',
            'data': ''
        }
        db.insert_log(log)
        return
    ontologies = ",".join(ontologies)
    annotations = []
    # Transliterate to ASCII and collapse whitespace before sending the text to the annotator.
    text = unidecode(text.decode('utf8'))
    text = ' '.join(text.split())
    post_data = dict(apikey=settings.BIOPORTAL_API_KEY, text=text,
                     display_links='true', display_context='false', minimum_match_length='3',
                     exclude_numbers='true', longest_only='true', ontologies=ontologies,
                     exclude_synonyms='true')
    try:
        response = requests.post(settings.ANNOTATOR_URL, post_data)
        json_results = json.loads(response.text)
        for result in json_results:
            for annotation in result['annotations']:
                # Keep up to 40 characters of context on each side of the matched text.
                context_begin = annotation['from'] if annotation['from'] - 40 < 1 else annotation['from'] - 40
                context_end = annotation['to'] if annotation['to'] + 40 > len(text) else annotation['to'] + 40
                record = {
                    'file_name': pdf_file_path.encode('utf-8'),
                    'bio_class_id': result['annotatedClass']['@id'],
                    'bio_ontology_id': result['annotatedClass']['links']['ontology'],
                    'text': annotation['text'],
                    'match_type': annotation['matchType'],
                    'context': text[context_begin:context_end]
                }
                annotations.append(record)
        db.insert_annotations(annotations)
        return
    except (ValueError, IndexError, KeyError) as e:
        print e
        log = {
            'file_name': pdf_file_path.encode('utf-8'),
            'error': 'Bad response from Bioportal Annotator',
            'exception': str(e),
            'data': ''
        }
        db.insert_log(log)
        return

def word_count(pdf_file_path):
    """Return a Counter of word frequencies for a single document."""
    # Extract plain text from the document based on its extension.
    if pdf_file_path.endswith('pdf') or pdf_file_path.endswith('PDF'):
        text = textract.process(pdf_file_path, method="pdfminer")
    elif pdf_file_path.endswith('html') or pdf_file_path.endswith('htm'):
        text = textract.process(pdf_file_path, method="beautifulsoup4")
    elif pdf_file_path.endswith('txt'):
        with open(pdf_file_path, 'r') as file:
            text = file.read()
    if text.isspace():
        log = {
            'file_name': pdf_file_path.encode('utf-8'),
            'error': 'Failed PDF to text transformation in word count process',
            'exception': '',
            'data': ''
        }
        db = DBConnect()
        db.insert_log(log)
        # Return an empty Counter so the caller can still sum the per-file counters.
        return Counter()
    text = unicode(text, 'utf-8')
    # Tokenize in upper case so counting is case-insensitive.
    words = word_tokenize(text.upper())
    c = Counter()
    c.update(words)
    return c

def get_recommendations_dir(dir_path):
    """Collect BioPortal ontology recommendations for every document in a directory."""
    file_names = []
    ontologies = {}
    db = DBConnect()
    # Gather all PDF, HTML and text files under the directory.
    for root, dirs, files in walk(dir_path):
        for name in files:
            if 'pdf' in name or 'htm' in name or 'txt' in name:
                file_names.append(path.join(root, name))
    click.secho("Getting Bioportal recommendations for {} documents.".format(len(file_names)), fg='blue')
    n_jobs = 20
    file_recommendations = Parallel(n_jobs=n_jobs)(delayed(get_recommendations_file)(file_name)
                                                   for file_name in file_names)
    # Count how often each recommended ontology appears across the documents.
    for recommendation in file_recommendations:
        for ontology in recommendation:
            if ontology['acronym'] in ontologies:
                ontologies[ontology['acronym']]['frequency'] += 1
            else:
                ontologies[ontology['acronym']] = {'frequency': 1, 'id': ontology['id']}
    db.insert_ontologies(ontologies)
    return list(ontologies.keys())

def get_recommendations_file(pdf_file_path):
    """Ask the BioPortal Recommender for the best ontology set for a single document."""
    # Extract plain text from the document based on its extension.
    if pdf_file_path.endswith('pdf') or pdf_file_path.endswith('PDF'):
        text = textract.process(pdf_file_path, method="pdfminer")
    elif pdf_file_path.endswith('html') or pdf_file_path.endswith('htm'):
        text = textract.process(pdf_file_path, method="beautifulsoup4")
    elif pdf_file_path.endswith('txt'):
        with open(pdf_file_path, 'r') as file:
            text = file.read()
    if text.isspace():
        log = {
            'file_name': pdf_file_path.encode('utf-8'),
            'error': 'Failed PDF to text transformation in recommendation process',
            'exception': '',
            'data': ''
        }
        db = DBConnect()
        db.insert_log(log)
        return []
    # Transliterate to ASCII and collapse whitespace before locating the abstract.
    text = unidecode(text.decode('utf8'))
    text = ' '.join(text.split())
    # Start from the abstract if one is found, otherwise from the beginning of the document.
    abstract_index = max(text.find('abstract'), text.find('ABSTRACT'), text.find('Abstract'))
    abstract_index = 0 if abstract_index < 0 else abstract_index
    # The recommender only needs a short sample, so send at most 500 characters.
    text = text[abstract_index:abstract_index + 500] if len(text) > 500 else text
    post_data = dict(apikey=settings.BIOPORTAL_API_KEY, input=text, include='ontologies',
                     display_links='false', output_type='2', display_context='false',
                     wc='0.15', ws='1.0', wa='1.0', wd='0.5')
    try:
        response = requests.post(settings.RECOMMENDER_URL, post_data)
        json_results = json.loads(response.text)
        best_ontology_set = json_results[0]['ontologies'] if len(json_results) > 0 else []
        return [{'acronym': ontology['acronym'], 'id': ontology['@id']} for ontology in best_ontology_set]
    except (ValueError, IndexError, KeyError) as e:
        log = {
            'file_name': pdf_file_path.encode('utf-8'),
            'error': 'Bad response from Bioportal Recommender',
            'exception': str(e),
            'data': ''
        }
        db = DBConnect()
        db.insert_log(log)
        return []

def init():
    """Create the database schema."""
    db = DBConnect()
    db.init_model()
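
# A minimal usage sketch, not part of the original module: it assumes the module-level
# imports used above (click, textract, requests, joblib's Parallel/delayed, walk, path,
# Counter, DBConnect, settings) and a configured database. It shows how these functions
# could be chained by hand: initialize the schema, count words over a corpus directory,
# ask the Recommender for an ontology set, then annotate each document against that set.
# The directory path below is purely illustrative.
if __name__ == '__main__':
    corpus_dir = '/path/to/corpus'  # hypothetical example directory
    init()                                             # create the database schema
    word_count_dir(corpus_dir)                         # store corpus-wide word frequencies
    recommended = get_recommendations_dir(corpus_dir)  # acronyms of recommended ontologies
    for root, dirs, files in walk(corpus_dir):
        for name in files:
            if 'pdf' in name or 'htm' in name or 'txt' in name:
                annotate_doc(path.join(root, name), recommended)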