def get_all_pmids():
    """Collect candidate PMIDs and keep those that pass the PubMed filter.

    The candidates are the result of get_pmc_results(pmc_query) followed
    by the result of datasources.portal_pmids.get_all_pmids().  The
    combined collection is handed to pubmed.filter_pmids() together with
    the module-level pubmed_query, and that call's result is returned.
    """
    candidates = (get_pmc_results(pmc_query)
                  + datasources.portal_pmids.get_all_pmids())
    return pubmed.filter_pmids(candidates, pubmed_query)
def get_all_pmids():
    """Collect candidate PMIDs, reporting progress, and filter them in PubMed.

    Prints a progress line before each of the three steps: querying PMC,
    querying the other portals, and filtering the combined candidates
    with the module-level pubmed_query.  Returns the result of
    pubmed.filter_pmids().
    """
    print("Getting PMC results")
    from_pmc = get_pmc_results(pmc_query)

    print("Getting other portal results")
    from_portals = datasources.portal_pmids.get_all_pmids()

    # Concatenate before announcing the filter step, as the steps are ordered.
    candidates = from_pmc + from_portals

    print("Filtering in PubMed")
    return pubmed.filter_pmids(candidates, pubmed_query)
def get_all_pmids():
    """Gather PMIDs from PMC and the portals, then filter them in PubMed.

    A progress message is printed ahead of each stage.  The value
    returned is whatever pubmed.filter_pmids() produces for the combined
    PMIDs and the module-level pubmed_query.
    """
    print("Getting PMC results")
    pmids = get_pmc_results(pmc_query)

    print("Getting other portal results")
    # Build a new combined sequence; the PMC result itself is not mutated.
    pmids = pmids + datasources.portal_pmids.get_all_pmids()

    print("Filtering in PubMed")
    filtered = pubmed.filter_pmids(pmids, pubmed_query)
    return filtered
def has_data_submission(query_pmids):
    """Returns a list of flags indicating whether the PubMed IDs
    are listed as a citation in GEO.

    query_pmids: sequence of PubMed IDs to check.

    The IDs are filtered with the "pubmed_gds[filter]" PubMed filter; the
    result has one entry per element of query_pmids, in the same order,
    encoded by map_booleans_to_flags().  An empty (or otherwise falsy)
    query_pmids yields an empty list without issuing any query.
    """
    if not query_pmids:
        return []

    # Build the lookup set once so each membership test below is O(1)
    # instead of scanning the filtered list for every query PMID.
    pmids_in_geo = set(filter_pmids(query_pmids, "pubmed_gds[filter]"))
    pmid_passes_filter = [pmid in pmids_in_geo for pmid in query_pmids]
    return map_booleans_to_flags(pmid_passes_filter)
def has_data_submission(query_pmids):
    """Returns a list of flags indicating whether the PubMed IDs
    are listed as a citation in GEO.

    query_pmids: sequence of PubMed IDs to check.

    Returns one flag per element of query_pmids, in input order, as
    encoded by map_booleans_to_flags().  A PMID is flagged when it
    survives the "pubmed_gds[filter]" PubMed filter.  Falsy input
    (empty list, None) short-circuits to an empty list.
    """
    if not query_pmids:
        return []

    filtered_pmids = filter_pmids(query_pmids, "pubmed_gds[filter]")
    # A set gives constant-time lookups; testing against the list would
    # rescan it once per query PMID.
    filtered_lookup = set(filtered_pmids)
    flags = map_booleans_to_flags(
        [pmid in filtered_lookup for pmid in query_pmids])
    return flags
def base():
    """Exploratory check of a candidate PubMed query (``query_for_5or6s``).

    Builds several PubMed query strings, passes ``query_for_5or6s`` to
    ``calc_query_performance(A, query_for_5or6s, A+B)`` and prints the
    returned precision and recall rounded to two decimals, then filters
    ``B`` with the same query.

    ``A`` and ``B`` are read from module scope; they are not defined in
    this function.  Nothing is returned.
    """
    base_query = """(("humans"[mesh] AND "magnetic resonance imaging"[mesh] AND Journal Article[ptyp] NOT "mental disorders"[mesh]) NOT (Editorial[ptyp] OR Letter[ptyp] OR Meta-Analysis[ptyp] OR Practice Guideline[ptyp] OR Review[ptyp] OR Case Reports[ptyp] OR Comment[ptyp] OR Corrected and Republished Article[ptyp]) AND ("1991"[PDAT] : "2001"[PDAT]) AND English[lang])"""
    # The next two queries are only referenced from the commented-out tail
    # of the query_to_try expression further down.
    query_for_1or2s = """(Movements OR Finger OR (Somatosensory Cortex[mesh]) OR somatosensory[Title/Abstract] OR (Cerebrovascular Circulation[mesh]) OR "primary motor"[Title/Abstract] OR "primary visual"[Title/Abstract] OR sensorimotor[Title/Abstract] OR "motor area"[Title/Abstract] OR oxygenation[Title/Abstract] OR (Motor Cortex[mesh]) OR "visual cortex"[Title/Abstract] OR (Acoustic Stimulation[mesh]))"""
    query_for_3or4s = """((Temporal Lobe[mesh]) OR (Prefrontal Cortex[mesh]) OR (Pattern Recognition, Visual[mesh]) OR (Visual Perception[mesh]) OR semantic[Title/Abstract] OR verbal[Title/Abstract] OR "left inferior"[Title/Abstract] OR retrieval[Title/Abstract] OR memory[Title/Abstract] OR language[Title/Abstract] OR Memory[mesh])"""
    # Earlier variants of the query, kept for reference:
    #query_for_5or6s = """(Amygdala OR Emotional OR emotionally[Title/Abstract] OR (Facial Expression[mesh]) OR faces[Title/Abstract] OR facial[Title/Abstract] OR affective[Title/Abstract] OR Affect[mesh] OR dissociation[Title/Abstract] OR Arousal[mesh] OR (Decision Making[mesh]) OR happy[Title/Abstract] OR attentional[Title/Abstract] or networks[Title/Abstract] or Cognition[mesh])"""
    #query_for_5or6s = """(Amygdala OR Emotional OR emotionally[Title/Abstract] OR (Facial Expression[mesh]) OR faces[Title/Abstract] OR facial[Title/Abstract] OR affective[Title/Abstract] OR Affect[mesh] OR Arousal[mesh] OR (Decision Making[mesh]) OR happy[Title/Abstract] OR attentional[Title/Abstract] or Cognition[mesh])"""
    # NOTE(review): this literal opens with "(" but contains no matching
    # ")" -- confirm whether the unbalanced parenthesis is intended.
    query_for_5or6s = """(Amygdala OR Emotions[mesh] OR emotion*[Title/Abstract] OR (Facial Expression[mesh]) OR faces[Title/Abstract] OR facial[Title/Abstract] OR 
Affect[mesh]"""
    #query_for_5or6s = """(Amygdala OR Emotional OR emotionally[Title/Abstract] OR (Facial Expression[mesh]) OR faces[Title/Abstract] OR facial[Title/Abstract] OR Affect[mesh] OR Arousal[mesh] OR (Decision Making[mesh])"""
    # query_to_try is assigned but not used below; the call that would
    # consume it is commented out.
    query_to_try = base_query + " AND " + query_for_5or6s# + " NOT (" + query_for_1or2s + " OR " + query_for_3or4s + ")"
    #(pr, re) = calc_query_performance(A, query_to_try, A+B)
    # The local name "re" holds the recall value here.
    (pr, re) = calc_query_performance(A, query_for_5or6s, A+B)
    print "Precision:", round(pr, 2)
    print "Recall:", round(re, 2)
    #filtered = pubmed.filter_pmids(B, "Emotions[mesh]")
    filtered = pubmed.filter_pmids(B, query_for_5or6s)
    # The loop body is a no-op while the print stays commented out.
    for pmid in filtered:
        # print pmid
        pass
    # Disabled block: the constant condition means it never executes.
    # NOTE(review): source indentation was lost; the extent of this block
    # is inferred -- confirm.
    if (False):
        pmids = pubmed.filter_pmids(B, "Visual Cortex[mesh]")
        for pmid in pmids:
            print pmid
        print "\n"
        for pmid in pmids:
            print pmid,
            if pmid in healthy_1991to2001_3or4:
                print "3or4",
            if pmid in healthy_1991to2001_5or6:
                print "5or6",
            print
def calc_query_performance(gold_pmids, query, recall_filter_pmids=None):
    """Return (precision, recall) of a PubMed query against a gold standard.

    gold_pmids: PMIDs the query is expected to find.
    query: PubMed query string being evaluated; it is printed first.
    recall_filter_pmids: PMIDs the query is run against via
        pubmed.filter_pmids(); defaults to gold_pmids when omitted or empty.

    Precision is the share of distinct PMIDs found by the query that are
    in the gold set.  Recall is the number of distinct gold PMIDs found
    divided by len(gold_pmids).  A ratio whose denominator is zero (the
    query found nothing, or gold_pmids is empty) is reported as 0.0
    rather than raising ZeroDivisionError.

    The intermediate counts are printed, one per line, for diagnostics.
    """
    print(query)
    if not recall_filter_pmids:
        recall_filter_pmids = gold_pmids

    found_pmids = pubmed.filter_pmids(recall_filter_pmids, query)
    found_set = set(found_pmids)
    gold_set = set(gold_pmids)
    # Gold PMIDs the query actually retrieved; computed once and reused.
    num_hits = len(gold_set & found_set)

    print(len(gold_pmids))
    print(len(recall_filter_pmids))
    print(len(found_pmids))
    print(num_hits)
    print(len(found_set))

    precision = float(num_hits) / len(found_set) if found_set else 0.0
    recall = float(num_hits) / len(gold_pmids) if gold_pmids else 0.0
    return (precision, recall)
def get_all_attributes(id_dict):
    """Extend every record of id_dict with attributes of its reuse article.

    id_dict: mapping whose values are tuples.  Position 0 is used as the
        accession key, position 4 as the reuse PMCID and position 5 as a
        sequence of reuse PMIDs (only its first element is used).

    Returns a dict keyed by accession key + reuse PMCID.  Each value is
    the original tuple, then "|", then the PMCID-level attributes
    (biolink flag, basic reuse flag, creation flag, open-access excerpt,
    matching words), then "|", then the PMID-level attributes
    (affiliation, journal, year, publication date, MEDLINE status, GEO
    reuse flag, open-access flag, meta-analysis flag, matching MeSH
    terms).  A PMCID or PMID with no attributes contributes an empty
    tuple.

    The word filters and every assembled record are printed as they are
    computed.  The module-level names metaquery, meshes and words are
    read but not defined here.
    """
    records = id_dict.values()
    all_reuse_pmcids = flatten_unique([vals[4] for vals in records])
    all_reuse_pmids = flatten_unique([vals[5] for vals in records])

    # Attributes looked up per reuse PMID.
    reuse_affiliation = affiliation.institution(all_reuse_pmids)
    journal = pubmed.journal(all_reuse_pmids)
    year = pubmed.year_published(all_reuse_pmids)
    date_published = pubmed.date_published(all_reuse_pmids)
    medline_status = pubmed.medline_status(all_reuse_pmids)
    is_geo_reuse = geo_reuse.is_geo_reuse(all_reuse_pmids)
    reuse_is_oa = pubmed.is_open_access(all_reuse_pmids)
    metaanal = pubmed._get_flags_for_pattern(all_reuse_pmids, metaquery)

    # Attributes looked up per reuse PMCID.  The pattern is a raw string
    # so that "\d" reaches the regex engine as written.
    oa_excerpts = oaexcerpt.get_oa_excerpts(
        all_reuse_pmcids,
        r"(GSE.\d|GDS.\d|omnibus)",
        flags=re.IGNORECASE | re.MULTILINE)
    biolink_filter = pubmedcentral.get_flags_for_pattern(
        all_reuse_pmcids,
        '(geo OR omnibus) AND microarray AND "gene expression" AND accession NOT (databases OR user OR users OR (public AND accessed) OR (downloaded AND published))')
    basic_reuse_filter = pubmedcentral.get_flags_for_pattern(
        all_reuse_pmcids,
        '"gene expression omnibus" AND (submitted OR deposited)')
    creation_filter = pubmedcentral.get_flags_for_pattern(
        all_reuse_pmcids,
        '("gene expression" [text] AND "microarray" [text] AND "cell" [text] AND "rna" [text]) AND ("rneasy" [text] OR "trizol" [text] OR "real-time pcr" [text]) NOT ("tissue microarray*" [text] OR "cpg island*" [text])')

    # For each PMID, the ";"-joined MeSH terms whose filter it passes.
    has_mesh = {}
    for term in meshes:
        has_mesh[term] = pubmed.filter_pmids(all_reuse_pmids, term)
    mesh_filters = [
        ";".join([term for term in has_mesh if pmid in has_mesh[term]])
        for pmid in all_reuse_pmids]

    # For each PMCID, the ";"-joined words whose filter it passes.
    has_word = {}
    for word in words:
        has_word[word] = pubmedcentral.filter_pmcids(all_reuse_pmcids, word)
    word_filters = [
        ";".join([word for word in has_word if pmcid in has_word[word]])
        for pmcid in all_reuse_pmcids]
    print(word_filters)

    # defaultdict(tuple) makes an unknown id contribute an empty tuple.
    reuse_pmid_dict = defaultdict(
        tuple,
        zip(all_reuse_pmids,
            zip(reuse_affiliation, journal, year, date_published,
                medline_status, is_geo_reuse, reuse_is_oa, metaanal,
                mesh_filters)))
    reuse_pmcid_dict = defaultdict(
        tuple,
        zip(all_reuse_pmcids,
            zip(biolink_filter, basic_reuse_filter, creation_filter,
                oa_excerpts, word_filters)))

    full_dict = {}
    for vals in records:
        accession_key = vals[0]
        reuse_pmcid = vals[4]
        # Only the first reuse PMID is used; "" stands in when there is none.
        reuse_pmid = vals[5][0] if vals[5] else ""
        key = accession_key + reuse_pmcid
        full_dict[key] = (vals
                          + ("|",) + reuse_pmcid_dict[reuse_pmcid]
                          + ("|",) + reuse_pmid_dict[reuse_pmid])
        print(full_dict[key])
    return full_dict
def get_all_pmids():
    """Return the PMIDs from PMC and the portals that pass the PubMed filter.

    PMIDs from get_pmc_results(pmc_query) and from
    datasources.portal_pmids.get_all_pmids() are concatenated in that
    order and passed to pubmed.filter_pmids() with the module-level
    pubmed_query.
    """
    unfiltered = get_pmc_results(pmc_query)
    # "+" builds a new sequence, leaving the PMC result untouched.
    unfiltered = unfiltered + datasources.portal_pmids.get_all_pmids()
    return pubmed.filter_pmids(unfiltered, pubmed_query)
def get_all_ae_pmids(query_pmids, complete_ae_pmids):
    """Return the items of query_pmids that also occur in complete_ae_pmids.

    The order (and any repetition) of query_pmids is kept.
    """
    all_ae_pmids = [pmid for pmid in query_pmids if pmid in complete_ae_pmids]
    return(all_ae_pmids)

def get_all_shared_pmids(gds_pmids, ae_pmids):
    """Return the distinct PMIDs found in either gds_pmids or ae_pmids.

    The two inputs are concatenated and de-duplicated through a set, so
    the order of the returned list is not defined.
    """
    all_shared_pmids = list(set(gds_pmids + ae_pmids))
    return(all_shared_pmids)

# Disabled driver code: the constant condition means it never executes.
# NOTE(review): source indentation was lost; the extent of this block is
# inferred -- confirm.
if (False):
    pmids = get_all_pmids()
    print(len(pmids))
    pmc_pmids = get_pmc_results(pmc_query)
    portal_pmids = datasources.portal_pmids.get_all_pmids()
    all_pmids_before_filter = pmc_pmids + portal_pmids
    pmids = pubmed.filter_pmids(all_pmids_before_filter, pubmed_query)
    gds_pmids = get_all_gds_pmids(pmids)
    complete_ae_pmids = get_complete_arrayexpress_pmids()
    ae_pmids = get_all_ae_pmids(pmids, complete_ae_pmids)
    shared_pmids = get_all_shared_pmids(gds_pmids, ae_pmids)
    # Adding 0.0 forces float division.
    shared_proportion = (len(shared_pmids)+0.0)/len(pmids)
    print shared_proportion

# Too many duplicates of this. Need to refactor duplicate code!
def _map_booleans_to_flags(list_of_True_False):
    """Map each boolean to the string '1' (True) or '0' (False).

    Returns a new list with the same length and order as the input.
    """
    mapping = {True:'1', False:'0'}
    list_of_flags = [mapping[i] for i in list_of_True_False]
    return(list_of_flags)

# Too many duplicates of this. Need to refactor duplicate code!
def get_is_in_flags(query_pmids, base_pmids):
#A = healthy_1991to2008_5or6 #B = healthy_2002to2008_not5or6 #A = healthy_1991to2001_3or4 #B = healthy_1991to2001_1or2 + healthy_1991to2001_5or6 # + healthy_1991to2001_7 #A = healthy_1991to2001_5or6 #B = healthy_1991to2001_1or2 + healthy_1991to2001_3or4 from data import neuroethicslike #A = neuroethicslike.pubmed_neurethicslike_query_results #A = neuroethicslike.pubmed_fmri_neuroethicslike_query_results base_query = """("fmri"[text] OR "magnetic resonance imaging"[mesh]) AND ((neurosciences[mesh] OR neuroscience[Title/Abstract] OR neurology[mesh]) AND (ethics[sh] OR ethical[Title/Abstract] OR "bioethical issues"[mesh] OR "ethics, medical"[mesh] OR "legislation and jurisprudence"[Subheading])) OR neuroethic*[Title/Abstract]""" base = pubmed.search(base_query) A = pubmed.filter_pmids(base, "Personal Autonomy") dist = coveyquery.get_mesh_frequency_distributions(A, getter=pubmed.mesh_basic) print "\n\nTop list for (A, getter=pubmed.mesh_basic):" coveyquery.print_frequency_proportion(A, dist, 150) if False: dist = coveyquery.get_text_frequency_distributions(A, getter=pubmed.article_title) print "\n\nTop list for (A, getter=pubmed.article_title):" coveyquery.print_frequency_proportion(A, dist, 30) dist = coveyquery.get_text_frequency_distributions(A, getter=pubmed.abstract) print "\n\nTop list for (A, getter=pubmed.abstract):" coveyquery.print_frequency_proportion(A, dist, 30) dist = coveyquery.get_text_frequency_distributions(A, getter=pubmed.title_and_abstract)
def get_PMIDs_from_DOIs(DOIs):
    """Look up PMIDs for a collection of DOIs.

    Each DOI gets the "[doi]" field tag appended, the tagged terms are
    joined with " OR ", and the resulting query is passed to
    filter_pmids() with "1" as its first argument.  Returns that call's
    result.
    """
    tagged_terms = []
    for doi in DOIs:
        tagged_terms.append(doi + "[doi]")
    query = " OR ".join(tagged_terms)
    return filter_pmids("1", query)
# The statement below is the tail of a definition whose beginning lies
# outside this view.
return (all_ae_pmids)

def get_all_shared_pmids(gds_pmids, ae_pmids):
    """Return the distinct PMIDs found in either gds_pmids or ae_pmids.

    The two inputs are concatenated and de-duplicated through a set, so
    the order of the returned list is not defined.
    """
    all_shared_pmids = list(set(gds_pmids + ae_pmids))
    return (all_shared_pmids)

# Disabled driver code: the constant condition means it never executes.
# NOTE(review): source indentation was lost; the extent of this block is
# inferred -- confirm.
if (False):
    pmids = get_all_pmids()
    print(len(pmids))
    pmc_pmids = get_pmc_results(pmc_query)
    portal_pmids = datasources.portal_pmids.get_all_pmids()
    all_pmids_before_filter = pmc_pmids + portal_pmids
    pmids = pubmed.filter_pmids(all_pmids_before_filter, pubmed_query)
    gds_pmids = get_all_gds_pmids(pmids)
    complete_ae_pmids = get_complete_arrayexpress_pmids()
    ae_pmids = get_all_ae_pmids(pmids, complete_ae_pmids)
    shared_pmids = get_all_shared_pmids(gds_pmids, ae_pmids)
    # Adding 0.0 forces float division.
    shared_proportion = (len(shared_pmids) + 0.0) / len(pmids)
    print shared_proportion

# Too many duplicates of this. Need to refactor duplicate code!
def _map_booleans_to_flags(list_of_True_False):
    """Map each boolean to the string '1' (True) or '0' (False).

    Returns a new list with the same length and order as the input.
    """
    mapping = {True: '1', False: '0'}
    list_of_flags = [mapping[i] for i in list_of_True_False]
    return (list_of_flags)
def get_all_attributes(id_dict):
    """Extend every record of id_dict with attributes of its reuse article.

    id_dict: mapping whose values are tuples.  Position 0 is used as the
        accession key, position 4 as the reuse PMCID and position 5 as a
        sequence of reuse PMIDs (only its first element is used).

    Returns a dict keyed by accession key + reuse PMCID.  Each value is
    the original tuple, then "|", then the PMCID-level attributes
    (biolink flag, basic reuse flag, creation flag, open-access excerpt,
    matching words), then "|", then the PMID-level attributes
    (affiliation, journal, year, publication date, MEDLINE status, GEO
    reuse flag, open-access flag, meta-analysis flag, matching MeSH
    terms).  A PMCID or PMID with no attributes contributes an empty
    tuple.

    The word filters and every assembled record are printed as they are
    computed.  The module-level names metaquery, meshes and words are
    read but not defined here.
    """
    records = id_dict.values()
    all_reuse_pmcids = flatten_unique([vals[4] for vals in records])
    all_reuse_pmids = flatten_unique([vals[5] for vals in records])

    # Attributes looked up per reuse PMID.
    reuse_affiliation = affiliation.institution(all_reuse_pmids)
    journal = pubmed.journal(all_reuse_pmids)
    year = pubmed.year_published(all_reuse_pmids)
    date_published = pubmed.date_published(all_reuse_pmids)
    medline_status = pubmed.medline_status(all_reuse_pmids)
    is_geo_reuse = geo_reuse.is_geo_reuse(all_reuse_pmids)
    reuse_is_oa = pubmed.is_open_access(all_reuse_pmids)
    metaanal = pubmed._get_flags_for_pattern(all_reuse_pmids, metaquery)

    # Attributes looked up per reuse PMCID.  The pattern is a raw string
    # so that "\d" reaches the regex engine as written.
    oa_excerpts = oaexcerpt.get_oa_excerpts(
        all_reuse_pmcids,
        r"(GSE.\d|GDS.\d|omnibus)",
        flags=re.IGNORECASE | re.MULTILINE)
    biolink_filter = pubmedcentral.get_flags_for_pattern(
        all_reuse_pmcids,
        '(geo OR omnibus) AND microarray AND "gene expression" AND accession NOT (databases OR user OR users OR (public AND accessed) OR (downloaded AND published))'
    )
    basic_reuse_filter = pubmedcentral.get_flags_for_pattern(
        all_reuse_pmcids,
        '"gene expression omnibus" AND (submitted OR deposited)')
    creation_filter = pubmedcentral.get_flags_for_pattern(
        all_reuse_pmcids,
        '("gene expression" [text] AND "microarray" [text] AND "cell" [text] AND "rna" [text]) AND ("rneasy" [text] OR "trizol" [text] OR "real-time pcr" [text]) NOT ("tissue microarray*" [text] OR "cpg island*" [text])'
    )

    # For each PMID, the ";"-joined MeSH terms whose filter it passes.
    has_mesh = {}
    for term in meshes:
        has_mesh[term] = pubmed.filter_pmids(all_reuse_pmids, term)
    mesh_filters = [
        ";".join([term for term in has_mesh if pmid in has_mesh[term]])
        for pmid in all_reuse_pmids
    ]

    # For each PMCID, the ";"-joined words whose filter it passes.
    has_word = {}
    for word in words:
        has_word[word] = pubmedcentral.filter_pmcids(all_reuse_pmcids, word)
    word_filters = [
        ";".join([word for word in has_word if pmcid in has_word[word]])
        for pmcid in all_reuse_pmcids
    ]
    print(word_filters)

    # defaultdict(tuple) makes an unknown id contribute an empty tuple.
    pmid_attributes = zip(reuse_affiliation, journal, year, date_published,
                          medline_status, is_geo_reuse, reuse_is_oa,
                          metaanal, mesh_filters)
    reuse_pmid_dict = defaultdict(tuple, zip(all_reuse_pmids, pmid_attributes))
    pmcid_attributes = zip(biolink_filter, basic_reuse_filter,
                           creation_filter, oa_excerpts, word_filters)
    reuse_pmcid_dict = defaultdict(tuple,
                                   zip(all_reuse_pmcids, pmcid_attributes))

    full_dict = {}
    for vals in records:
        accession_key = vals[0]
        reuse_pmcid = vals[4]
        # Only the first reuse PMID is used; "" stands in when there is none.
        reuse_pmid = vals[5][0] if vals[5] else ""
        key = accession_key + reuse_pmcid
        full_dict[key] = (vals
                          + ("|", ) + reuse_pmcid_dict[reuse_pmcid]
                          + ("|", ) + reuse_pmid_dict[reuse_pmid])
        print(full_dict[key])
    return full_dict
def get_all_gds_pmids(pmids):
    """Return the result of filtering pmids with "pubmed_gds[filter]".

    Delegates to pubmed.filter_pmids() and returns its result unchanged.
    """
    return pubmed.filter_pmids(pmids, "pubmed_gds[filter]")
def get_PMIDs_from_DOIs(DOIs):
    """Resolve DOIs to PMIDs through a single OR-ed "[doi]" query.

    Every DOI is suffixed with "[doi]"; the terms are joined with " OR "
    and passed as the query to filter_pmids(), whose first argument is
    the literal "1".  The result of that call is returned.
    """
    doi_query = " OR ".join([each_doi + "[doi]" for each_doi in DOIs])
    matching_pmids = filter_pmids("1", doi_query)
    return matching_pmids
def get_all_gds_pmids(pmids):
    """Filter pmids with the "pubmed_gds[filter]" PubMed filter.

    Returns whatever pubmed.filter_pmids() yields for the given PMIDs.
    """
    gds_filter = "pubmed_gds[filter]"
    gds_pmids = pubmed.filter_pmids(pmids, gds_filter)
    return gds_pmids