def _extract_series_title(lines):
    """
    Find the series title in the lines of one article file.

    A title is the text of an ``<li>...</li>`` item sitting on the line right
    after a ``<ul class="series-titles">`` line.

    :param lines: the lines of the article file, in order
    :return: the lower-cased title of the first well-formed entry, or None
    """
    # Pairing each line with its successor means a marker on the very last
    # line is simply skipped instead of indexing past the end of the list.
    for line, following in zip(lines, lines[1:]):
        if '<ul class="series-titles">' in line:
            item = following.strip()
            if item.startswith("<li>") and item.endswith("</li>"):
                # drop the leading "<li>" (4 chars) and trailing "</li>" (5)
                return item[4:-5].lower()
    return None


def get_article_series(
        dir="/mnt/b5320167-5dbd-4498-bf34-173ac5338c8d/Datasets/bmj_case_reports_data/BMJ_Case_Reports/",
        dataset_file="/mnt/b5320167-5dbd-4498-bf34-173ac5338c8d/Datasets/bmj_case_reports_data/dataset_json_concept_annotated/dataset1.0.json"
):
    """
    Map article ids to the series title found in their ``.full`` file.

    Every path returned by ``get_file_list(dir)`` that ends in ``.full`` is
    read and searched for a ``<ul class="series-titles">`` line followed by an
    ``<li>...</li>`` line. The article id is the file's base name without the
    ``.full`` suffix. Files without such an entry are left out.

    :param dir: location handed to ``get_file_list`` to enumerate the files
    :param dataset_file: path handed to ``describe_data.get_doc_ids``; only
        articles whose id is among the returned ids are kept
    :return: {article_id: series_name}
    """
    # NOTE(review): kept as a call-time import like the original; presumably
    # this avoids a circular import with describe_data -- confirm.
    from describe_data import get_doc_ids

    series = {}
    for f in get_file_list(dir):
        if not f.endswith(".full"):
            continue
        # Context manager so the handle is closed before the next file opens.
        with open(f) as fh:
            lines = fh.readlines()
        title = _extract_series_title(lines)
        if title is not None:
            # [:-5] strips the ".full" suffix from the base name
            series[os.path.basename(f)[:-5]] = title

    # filter out those not in json dataset
    doc_ids = get_doc_ids(dataset_file)
    return {doc_id: s for doc_id, s in series.items() if doc_id in doc_ids}
def _extract_specialties(lines):
    """
    Find the ``DC.subject`` specialties in the lines of one article file.

    Two layouts are recognised: the ``content`` and ``name="DC.subject"``
    attributes on the same line, or the ``content`` attribute on one line with
    ``name="DC.subject"`` on the line that follows.

    :param lines: the lines of the article file, in order
    :return: set of lower-cased specialties (the content split on "; ") from
        the first line whose pattern matches, or None when nothing matches
    """
    for idx, line in enumerate(lines):
        if '<meta content="' not in line:
            continue
        if 'name="DC.subject"' in line:
            # both attributes on one line
            match = re.search(r'<meta content="(.*)" name="DC\.subject', line)
        elif idx + 1 < len(lines) and 'name="DC.subject"' in lines[idx + 1]:
            # tag wrapped over two lines; the bounds check keeps a
            # '<meta content="' on the last line from raising IndexError
            match = re.search(r'<meta content="(.*)"', line)
        else:
            continue
        if match:
            return set(match.group(1).lower().split("; "))
    return None


def get_article_specialty(
        dir="/mnt/b5320167-5dbd-4498-bf34-173ac5338c8d/Datasets/bmj_case_reports_data/BMJ_Case_Reports/",
        dataset_file="/mnt/b5320167-5dbd-4498-bf34-173ac5338c8d/Datasets/bmj_case_reports_data/dataset_json_concept_annotated/dataset1.0.json"
):
    """
    Map article ids to the set of specialties listed in their ``.full`` file.

    Every path returned by ``get_file_list(dir)`` that ends in ``.full`` is
    read and searched for a ``<meta content="..." name="DC.subject"`` tag. The
    article id is the file's base name without the ``.full`` suffix. Files
    without a matching tag are left out.

    :param dir: location handed to ``get_file_list`` to enumerate the files
    :param dataset_file: path handed to ``describe_data.get_doc_ids``; only
        articles whose id is among the returned ids are kept
    :return: {article_id: {spec1, ...}}
    """
    # NOTE(review): kept as a call-time import like the original; presumably
    # this avoids a circular import with describe_data -- confirm.
    from describe_data import get_doc_ids

    specs = {}
    for f in get_file_list(dir):
        if not f.endswith(".full"):
            continue
        # Context manager so the handle is closed before the next file opens.
        with open(f) as fh:
            lines = fh.readlines()
        specialties = _extract_specialties(lines)
        if specialties is not None:
            # [:-5] strips the ".full" suffix from the base name
            specs[os.path.basename(f)[:-5]] = specialties

    # filter out those not in json dataset
    doc_ids = get_doc_ids(dataset_file)
    return {doc_id: s for doc_id, s in specs.items() if doc_id in doc_ids}