import sys
import time

from unidecode import unidecode

# HEADERS_MRCONSO, HEADERS_MRSTY, get_semantic_types, get_mrconso_iterator,
# and countlines are defined elsewhere in this module.


def extract_from_mrconso(
        mrconso_path, mrsty_path, opts,
        mrconso_header=HEADERS_MRCONSO, mrsty_header=HEADERS_MRSTY):
    """Yield (term, cui, semtypes, preferred) tuples from MRCONSO,
    de-duplicated on (cui, term)."""
    start = time.time()
    print('loading semantic types...', end=' ')
    sys.stdout.flush()
    sem_types = get_semantic_types(mrsty_path, mrsty_header)
    print('done in {:.2f} s'.format(time.time() - start))

    start = time.time()

    mrconso_iterator = get_mrconso_iterator(
        mrconso_path, mrconso_header, opts.language
    )

    total = countlines(mrconso_path)

    processed = set()
    i = 0

    for content in mrconso_iterator:
        i += 1

        if i % 100000 == 0:
            delta = time.time() - start
            status = (
                '{:,} in {:.2f} s ({:.2%}, {:.1e} s / term)'
                ''.format(i, delta, i / total, delta / i)
            )
            print(status)

        concept_text = content['str'].strip()
        cui = content['cui']
        # ISPREF is the string 'Y' or 'N'; compare explicitly, since any
        # non-empty string (including 'N') is truthy.
        preferred = 1 if content['ispref'] == 'Y' else 0

        if opts.lowercase:
            concept_text = concept_text.lower()

        if opts.normalize_unicode:
            concept_text = unidecode(concept_text)

        # skip (cui, term) pairs that have already been yielded
        if (cui, concept_text) in processed:
            continue
        processed.add((cui, concept_text))

        yield (concept_text, cui, sem_types[cui], preferred)

    delta = time.time() - start
    status = (
        '\nCOMPLETED: {:,} in {:.2f} s ({:.1e} s / term)'
        ''.format(i, delta, delta / i if i > 0 else 0)
    )
    print(status)
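# Usage sketch (illustrative only) for the generator above: the RRF paths
# and the SimpleNamespace-based `opts` are assumptions about the caller,
# not part of this module.
def _demo_extract_usage():
    from types import SimpleNamespace

    opts = SimpleNamespace(
        language='ENG',          # keep English rows only
        lowercase=True,          # fold terms to lowercase
        normalize_unicode=True,  # transliterate to ASCII via unidecode
    )
    for term, cui, semtypes, preferred in extract_from_mrconso(
            'MRCONSO.RRF', 'MRSTY.RRF', opts):
        pass  # e.g., hand each tuple to a database writer here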
def extract_from_mrconso(
        mrconso_path, mrsty_path, opts,
        mrconso_header=HEADERS_MRCONSO, mrsty_header=HEADERS_MRSTY):
    """Yield (term, code, cui, semtypes, preferred) tuples from MRCONSO,
    de-duplicated on (code, term). Suppressed rows are deferred and kept
    only if their code also appears on an unsuppressed row."""
    start = time.time()
    print('loading semantic types...', end=' ')
    sys.stdout.flush()
    sem_types = get_semantic_types(mrsty_path, mrsty_header)
    print('done in {:.2f} s'.format(time.time() - start))

    start = time.time()

    mrconso_iterator = get_mrconso_iterator(
        mrconso_path, mrconso_header, opts.language
    )

    total = countlines(mrconso_path)

    processed = set()
    yielded = set()
    put_aside = set()
    code_concepts = 0
    i = 0
    added_from_suppressed = 0

    for content in mrconso_iterator:
        i += 1

        if i % 100000 == 0:
            delta = time.time() - start
            status = (
                '{:,} in {:.2f} s ({:.2%}, {:.2f} ms / term)'
                ''.format(i, delta, i / total, delta / i * 1000)
            )
            print(status)

        concept_text = content['str'].strip()
        cui = content['cui']
        code = content['code']
        # ISPREF is the string 'Y' or 'N'; compare explicitly.
        preferred = 1 if content['ispref'] == 'Y' else 0
        suppressed = content['suppress']

        if opts.lowercase:
            concept_text = concept_text.lower()

        if opts.normalize_unicode:
            concept_text = unidecode(concept_text)

        # skip (code, term) pairs that have already been yielded
        if (code, concept_text) in processed:
            continue

        # Check SUPPRESS before yielding. If the row is suppressed but its
        # code has already been yielded as active, the term can still be
        # added to the database; otherwise put it aside until the end,
        # recheck, and yield it only if its code turned up active.
        code_yielded = code in yielded
        if suppressed != 'N':
            if not code_yielded:
                # tuple(...) because lists are not hashable in a set
                put_aside.add(
                    (concept_text, code, cui, tuple(sem_types[cui]), preferred))
                continue
            else:
                added_from_suppressed += 1

        processed.add((code, concept_text))
        code_concepts += 1
        if not code_yielded:
            yielded.add(code)
        yield (concept_text, code, cui, sem_types[cui], preferred)

    # Recheck the put-aside terms: yield and log those whose code was
    # later seen as active; log the rest as rejected.
    print('Concepts put aside: {}'.format(len(put_aside)))
    rejected = 0
    with open('codes_rejected.txt', 'w') as rejected_file, \
            open('codes_added.txt', 'w') as added_file:
        for (concept_text, code, cui, semtypes, preferred) in put_aside:
            if code in yielded and (code, concept_text) not in processed:
                processed.add((code, concept_text))
                code_concepts += 1
                yield (concept_text, code, cui, list(semtypes), preferred)
                added_from_suppressed += 1
                added_file.write(
                    str((concept_text, code, cui, preferred)) + '\n')
                continue
            rejected += 1
            rejected_file.write(
                str((concept_text, code, cui, preferred)) + '\n')

    print('Terms added from suppressed: {}'.format(added_from_suppressed))
    print('Terms rejected from put aside: {}'.format(rejected))
    print('Total number of code-concept combinations added to db: {}'
          ''.format(code_concepts))

    delta = time.time() - start
    status = (
        '\nCOMPLETED: {:,} in {:.2f} s ({:.2f} ms / term)'
        ''.format(i, delta, delta / i * 1000 if i > 0 else 0)
    )
    print(status)
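# Toy illustration (hypothetical rows, not real MRCONSO data) of the
# put-aside pattern used above: a suppressed row survives only if its
# code is also seen on an active (SUPPRESS == 'N') row somewhere.
def _demo_put_aside():
    rows = [
        {'code': 'C100', 'str': 'old synonym', 'suppress': 'O'},
        {'code': 'C100', 'str': 'current term', 'suppress': 'N'},
        {'code': 'C200', 'str': 'retired term', 'suppress': 'O'},
    ]
    yielded, put_aside, kept = set(), set(), []
    for row in rows:
        if row['suppress'] != 'N' and row['code'] not in yielded:
            put_aside.add((row['str'], row['code']))  # defer until the end
            continue
        yielded.add(row['code'])
        kept.append((row['str'], row['code']))
    # recheck: recover deferred rows whose code later appeared as active
    kept += [pair for pair in put_aside if pair[1] in yielded]
    print(kept)  # both C100 rows survive (one recovered); C200 is rejected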