def calculate_relevance_scores(query: dict, json_contents: dict, ef: EntityInfo):
    """
    Score every reply of a thread against the query (the thread's first post).

    @param query: Query dict (expects 'category', 'threadId', 'annotations').
    @param json_contents: Parsed thread JSON with 'replies', 'threadId', 'commonCategory'.
    @param ef: EntityInfo class, used to classify entities as informative or not.
    @return: Dict mapping document name ("EF<threadId>r<index>") to its score,
             or None if a required key was missing anywhere in the thread.
    """
    scored_documents = {}
    for index, reply in enumerate(json_contents['replies']):
        # Reply 0 is the original post (the query itself), not a candidate answer.
        if index == 0:
            continue
        try:
            is_same_category = query['category'] == json_contents['commonCategory']
            is_same_thread = query['threadId'] == json_contents['threadId']
            number_votes = (reply['postThankYouCount'] + reply['postHelpfulCount']
                            + reply['postSupportCount'])
            text_length = len(reply['postText'])
            is_doctor_reply = reply['mdReply']
            number_medical_entities = 0
            number_same_entities = 0
            if 'annotationsFull' in reply:
                # De-duplicate entity codes so each entity is counted once per reply.
                annotations = set(get_entity_code(entity) for entity in reply['annotationsFull'])
                for entity in annotations:
                    if entity in ef.informative_entities:
                        number_medical_entities += 1
                        if entity in query['annotations']:
                            number_same_entities += 1
                    elif entity not in ef.other_entities and ef.is_informative_entity(entity):
                        # Newly discovered informative entity: count it and cache it.
                        number_medical_entities += 1
                        ef.update_informative_list(entity)
                        if entity in query['annotations']:
                            number_same_entities += 1
                    elif entity not in ef.other_entities:
                        # Newly discovered non-informative entity: cache that, too.
                        ef.update_other_list(entity)
            document_score = scoring_function(is_doctor_reply, number_votes,
                                              number_medical_entities, number_same_entities,
                                              text_length, is_same_category, is_same_thread)
            document_name = "EF" + str(json_contents['threadId']) + "r" + str(index)
            scored_documents[document_name] = document_score
        except KeyError as ke:
            # BUG FIX: threadId is numeric (str() is applied when building document_name
            # above), so concatenating it directly raised TypeError and masked the
            # KeyError being reported. Wrap it in str().
            if 'threadId' in json_contents:
                print("KeyError in file: " + str(json_contents['threadId']))
            else:
                print("KeyError in unknown file.")
            return None
    return scored_documents
def extract_query(json_contents: dict, ef: EntityInfo):
    """
    Build a query dict from the first post of a thread.

    @param json_contents: Parsed thread JSON.
    @param ef: EntityInfo class, used to filter annotations down to informative ones.
    @return: Query dict, or None when the thread has fewer than two replies.
    """
    if json_contents['replyCount'] < 2:
        return None
    first_post = json_contents['replies'][0]
    query = {'category': json_contents['commonCategory']}
    if 'annotationsFull' in first_post:
        # Keep only the entity codes that EntityInfo deems informative.
        query['annotations'] = [
            code
            for code in (get_entity_code(a) for a in first_post['annotationsFull'])
            if ef.is_informative_entity(code)
        ]
    else:
        query['annotations'] = None
    post_text = first_post['postText']
    query['length'] = len(post_text)
    query['text'] = post_text.replace('\t', '')
    query['username'] = first_post['createdBy']['username']
    query['threadId'] = json_contents['threadId']
    if 'annotatedOriginCategory' in json_contents:
        query['annotatedOriginCategory'] = json_contents['annotatedOriginCategory']
    return query
def make_annotation_types(annotations: list, ef: EntityInfo) -> list:
    """
    Count how many annotations fall under each informative entity type.

    @param annotations: List of entity codes (e.g. C124894) to classify; may be None.
    @param ef: EntityInfo class, used to look up the types of each entity.
    @return: Number of each entity type (uses the list of informative entity types
             defined in relevanceRanking.connect_to_kb), in that list's order.
    """
    types_counts = {entity_type: 0 for entity_type in informative_entity_types}
    if annotations is not None:
        for annotation in annotations:
            try:
                types = ef.get_entity_types(annotation)
            except ValueError:
                print("Value could not be found: " + str(annotation))
                continue
            for entity_type in types:
                # BUG FIX: get_entity_types may report a type that is not in the
                # informative list; the original raised KeyError on such types.
                # Per the docstring, only informative types are counted.
                if entity_type in types_counts:
                    types_counts[entity_type] += 1
    return [types_counts[entity_type] for entity_type in informative_entity_types]
def produce_training_data(scores: dict, queries: dict, documents: dict, ef: EntityInfo) -> (list, list):
    """
    Turn BM25 (query, document) score pairs into feature vectors and target values.

    @param scores: query_id -> {document_id -> BM25 score}.
    @param queries: Full details of the queries, keyed by query id.
    @param documents: Relevant details of the documents, keyed by document id.
    @param ef: EntityInfo class, used to extract information about informative entities.
    @return: Tuple (training feature rows, one-element target rows), aligned by index.
    """
    training_data = []
    target_values = []
    for query_id in scores:
        for document_id in scores[query_id]:
            # Skip pairs for which either side could not be resolved.
            if document_id not in documents or query_id not in queries:
                continue
            document = documents[document_id]
            query = queries[query_id]
            is_same_category = int(query['category'] == document['category'])
            is_same_thread = int(query['threadId'] == document['threadId'])
            number_votes_t = document['votes-t']
            number_votes_s = document['votes-s']
            number_votes_h = document['votes-h']
            is_doctor_reply = int(document['mdReply'])
            # Bucket the post length into 150-character steps.
            text_length_factor = document['postLength'] // 150
            number_medical_entities = 0
            number_same_entities = 0
            # BUG FIX: query['annotations'] may be None (see extract_query);
            # `entity in None` raised TypeError, which the ValueError handler
            # below did not catch. Treat None as "no annotations".
            query_annotations = query['annotations'] if query['annotations'] is not None else []
            annotations = set(document['annotations'])
            for entity in annotations:
                try:
                    if ef.is_informative_entity(entity):
                        number_medical_entities += 1
                        if entity in query_annotations:
                            number_same_entities += 1
                except ValueError:
                    print("The entity " + entity + " could not be evaluated. Skipping...")
                    continue
            training_item = [
                is_doctor_reply, number_votes_h, number_votes_s, number_votes_t,
                number_medical_entities, number_same_entities, text_length_factor,
                is_same_category, is_same_thread
            ]
            target_item = [scores[query_id][document_id]]
            training_data.append(training_item)
            target_values.append(target_item)
    return training_data, target_values
def main():
    """Read BM25 scores, resolve queries/documents, and write out training data."""
    ef = EntityInfo()
    bm25_scores = read_score_file(starting_file)
    full_queries = make_queries(list(bm25_scores.keys()), ef)
    # Collect every document id referenced by any query's score list.
    document_ids = {
        doc_id
        for per_query_scores in bm25_scores.values()
        for doc_id in per_query_scores.keys()
    }
    full_documents = find_documents(document_ids)
    training_data, target_values = produce_training_data(
        bm25_scores, full_queries, full_documents, ef)
    write_out_training_data(output_directory, training_data, target_values)
def main():
    """Build queries, dump them to queries.json, then process all documents."""
    entity_info = EntityInfo()
    queries = make_queries(starting_directory, entity_info)
    with open(os.path.join(output_directory, "queries.json"), "w+", encoding="utf8") as output_file:
        json_queries = []
        for query in queries:
            # NOTE: json_query aliases the query dict on purpose — the annotation
            # set is converted in place to a JSON-serializable list.
            json_query = query
            # BUG FIX: extract_query may leave 'annotations' as None; list(None)
            # raised TypeError. Leave None as-is (valid JSON null).
            if query['annotations'] is not None:
                json_query['annotations'] = list(query['annotations'])
            json_queries.append(json_query)
        json.dump(json_queries, output_file)
    print("Wrote queries to file.")
    loop_all_documents(queries, starting_directory, entity_info)
# --- Script fragment; the view is truncated above and below this span. ---
# Collect document titles from an Elasticsearch-style response object.
for result in contents['hits']['hits']:
    titles.append(result['_source']['Title'])
print(len(titles))
# print(json.dumps(titles))
# Build a lowercase category -> Mayo name map from a ";"-separated text file.
mayo_namemap = {}
with open("d:/onedrive/documents/ab germany/health_data/mayo_map.txt", "r", encoding="utf8", errors='ignore') as file:
    for line in file:
        category, mayo_name = line.split(";")
        category = category.lower()
        mayo_name = mayo_name.replace('\n', '').lower()
        mayo_namemap[category] = mayo_name
ef = EntityInfo()
# print(contents['hits']['hits'][1]['_source']['aida']['annotatedText'])
entities_by_category = {}
# NOTE(review): the message says "titles" but this prints the size of the
# category map, not len(titles) — confirm which count was intended.
print("Number of titles: " + str(len(mayo_namemap)))
counter = 0
print('Values:')
print(list(mayo_namemap.values()))
for result in contents['hits']['hits']:
    print(result['_source']['Title'])
    # NOTE(review): str.find() returns 0 when 'Abdominal' is at the START of the
    # title, so `> 0` skips those titles — likely should be `>= 0` (or `in`).
    # The body of this if-statement is truncated below this view.
    if result['_source']['Title'].find('Abdominal') > 0:
# --- Script fragment; truncated above (this closes an open() call started
# outside this view) and below. Indentation partially reconstructed. ---
        encoding="utf8")
if WRITE_DOCUMENTS:
    # Output handle for the document data; presumably closed outside this view.
    data_file = open(os.path.join(output_directory_data, "data0.trac"), "w+",
                     encoding="utf8")
processed_files = 0
error_files = 0
documents_written = 0
queries_written = 0
# Entity codes look like "C" + 3 or more digits; the long form also carries a
# surface string: [[C123|some text]].
pattern = re.compile('C[0-9]{3,}')
pattern_long = re.compile("\[\[C[0-9]+\|[\w\s]{1,}\]\]")
ef = EntityInfo()
# Walk through the input JSON data directory.
for root, dirs, files in os.walk(starting_directory):
    for file_name in files:
        try:
            with open(os.path.join(root, file_name), "r", encoding="utf8") as file:
                content = json.loads(file.read())
            # Write out progress every bunch of files.
            processed_files += 1
            if processed_files % 100 == 0:
                print("Processed files: " + str(processed_files))
            # Body continues past this view.
            if WRITE_QUERIES:
'''
 * Created by filip on 08/11/2019
'''
from prepareTrainingData.EntityInfo.entities_info import EntityInfo
from prepareTrainingData.EntityInfo.connect_to_kb import get_entity_types

# Load the informative entity codes, one per line, stripping trailing newlines.
with open("/home/fadamik/Documents/informative_nodes.txt", "r", encoding="utf8") as file:
    informative_entities = {line.replace("\n", "") for line in file}

# Resolve the knowledge-base types of every informative entity, cache them in
# EntityInfo, and persist the result.
ei = EntityInfo()
for entity in informative_entities:
    ei.update_entity_types(entity, get_entity_types(entity, ei.get_es()))
ei.write_out_entity_types()
# --- Script fragment; truncated below this view. ---
processed_files = 0
error_files = 0
documents_written = 0
queries_written = 0
# NOTE(review): these four handles are opened here and presumably closed
# further down, outside this view — confirm; otherwise they leak.
data_file = open(os.path.join(output_directory_data, "data1.trac"), "w+",
                 encoding="utf8")
maps_file_2 = open(os.path.join(output_directory_maps, "topic-map2.txt"), "w+",
                   encoding="utf8")
maps_file_3 = open(os.path.join(output_directory_maps, "topic-map3.txt"), "w+",
                   encoding="utf8")
query_file = open(os.path.join(output_directory_queries, "queries0.txt"), "w+",
                  encoding="utf8")
# Entity codes look like "C" + 3 or more digits; the long form also carries a
# surface string: [[C123|some text]].
pattern = re.compile('C[0-9]{3,}')
pattern_long = re.compile("\[\[C[0-9]+\|[\w\s]{1,}\]\]")
ef = EntityInfo()
for root, dirs, files in os.walk(starting_directory):
    for file_name in files:
        try:
            file = open(os.path.join(root, file_name), "r", encoding="utf8")
            content = json.loads(file.read())
            file.close()
            # Reply 0 is the thread's first post, i.e. the query.
            query = content['replies'][0]
            topic_no = str(content['threadId'])
            entities_string = ""
            # Body continues past this view.
            if 'annotationsFull' in query:
                for annotation in query['annotationsFull']:
def produce_training_data(selected: list, queries: dict, documents: dict, ef: EntityInfo) -> list:
    """
    Create training data by reading the full query, extracting document features in
    respect to the query and reading the document details.

    @param selected: Mapping of selected query ids to their selected document ids.
    @param queries: Full details of the selected queries.
    @param documents: Relevant details of the selected documents.
    @param ef: EntityInfo class, used to extract information about informative entities.
    @return: List of individual training pairs with all details necessary for Snorkel processing.
    """
    training_data = []
    entity_list = ef.get_entity_relations()
    for query_id in selected:
        if query_id not in queries:
            continue
        query = queries[query_id]
        # Fold the annotated origin category (if any) into the query's annotations.
        if 'annotatedOriginCategory' in query:
            category_annotation = get_entity_code(query['annotatedOriginCategory'])
            if category_annotation is not None:
                if query['annotations'] is not None:
                    query['annotations'].append(category_annotation)
                else:
                    query['annotations'] = [category_annotation]
        # Query-side fields are identical for every document of this query, so
        # compute them once (loop-invariant hoist).
        query_category = query['category']
        query_thread = query['threadId']
        query_text = query['text'].replace('\t', '')
        query_username = query['username']
        query_annotation_count = '\t'.join(
            str(count) for count in make_annotation_types(query['annotations'], ef))
        for document_id in selected[query_id]:
            # BUG FIX: the original used `break` here, silently dropping ALL
            # remaining documents of this query after the first unresolved id.
            # The sibling produce_training_data implementation uses `continue`
            # for the same condition; skip only the missing document.
            if document_id not in documents:
                continue
            document = documents[document_id]
            document_annotations = ';'.join(document['annotations'])
            if query['annotations'] is not None:
                query_annotations = ';'.join(query['annotations'])
                # Collect KB relation labels between each query entity and each
                # document entity, comma-separated.
                relation_labels = []
                for annotation in query['annotations']:
                    if annotation in entity_list:
                        for document_annotation in document['annotations']:
                            if document_annotation in entity_list[annotation]:
                                relation_labels.append(
                                    entity_list[annotation][document_annotation])
                relationships = ','.join(relation_labels)
            else:
                query_annotations = None
                relationships = None
            document_category = document['category']
            document_thread = document['threadId']
            document_user_status = document['userStatus']
            document_username = document['username']
            document_number_votes_t = document['votes-t']
            document_number_votes_s = document['votes-s']
            document_number_votes_h = document['votes-h']
            document_text = document['document-text'].replace('\t', '').replace('\n', '')
            document_is_doctor_reply = document['mdReply']
            document_annotation_count = '\t'.join(
                str(count) for count in make_annotation_types(document['annotations'], ef))
            # Two 0s at the end represent a BM25 score (non-relevant queries don't have it)
            # and BM25 relevance (0 for non-relevant)
            training_item = [
                query_category, query_thread, query_id, query_text, query_username,
                query_annotations, query_annotation_count, document_category,
                document_id, document_thread, document_text, document_is_doctor_reply,
                document_number_votes_h, document_number_votes_s, document_number_votes_t,
                document_username, document_user_status, document_annotations,
                document_annotation_count, relationships, '0', '0'
            ]
            training_data.append(training_item)
    return training_data