Example #1
0
def get_all_pmids():
    """Gather candidate PMIDs from PMC and the portals, then filter them.

    The PMC results for ``pmc_query`` and the portal PMIDs are concatenated
    (PMC first) and handed to ``pubmed.filter_pmids`` together with
    ``pubmed_query``; whatever that call returns is returned unchanged.
    """
    candidates = get_pmc_results(pmc_query)
    candidates = candidates + datasources.portal_pmids.get_all_pmids()
    return pubmed.filter_pmids(candidates, pubmed_query)
def get_all_pmids():
    """Gather candidate PMIDs from PMC and the portals, then filter them.

    Progress messages are printed before each of the three stages.  The
    return value is whatever ``pubmed.filter_pmids`` yields for the
    concatenated PMC + portal list and ``pubmed_query``.
    """
    # A single parenthesised argument prints the same under the Python 2
    # print statement and the Python 3 print function.
    print("Getting PMC results")
    from_pmc = get_pmc_results(pmc_query)

    print("Getting other portal results")
    from_portals = datasources.portal_pmids.get_all_pmids()

    unfiltered = from_pmc + from_portals
    print("Filtering in PubMed")
    return pubmed.filter_pmids(unfiltered, pubmed_query)
Example #3
0
def get_all_pmids():
    """Return the PubMed-filtered union of PMC and portal PMIDs.

    Prints a progress line before querying PMC, before querying the
    portals, and before filtering the concatenated list through
    ``pubmed.filter_pmids`` with ``pubmed_query``.
    """
    print("Getting PMC results")
    pmc_hits = get_pmc_results(pmc_query)
    print("Getting other portal results")
    portal_hits = datasources.portal_pmids.get_all_pmids()
    combined_hits = pmc_hits + portal_hits
    print("Filtering in PubMed")
    filtered_hits = pubmed.filter_pmids(combined_hits, pubmed_query)
    return filtered_hits
Example #4
0
File: geo.py Project: atiw003/pypub
def has_data_submission(query_pmids):
    """Flag which PubMed IDs are listed as a citation in GEO.

    The IDs are run through ``filter_pmids`` with the ``pubmed_gds[filter]``
    query; an ID counts as a hit when it appears in that result.

    Args:
        query_pmids: sequence of PubMed IDs to check.

    Returns:
        A list parallel to ``query_pmids`` holding the flags produced by
        ``map_booleans_to_flags`` (0 or 1 per the original contract).
        An empty or ``None`` input returns ``[]`` without querying PubMed.
    """
    if not query_pmids:
        return []
    # Build the lookup once: set membership is O(1), whereas testing each
    # query ID against the raw result list rescans it every time.
    filtered_pmids = set(filter_pmids(query_pmids, "pubmed_gds[filter]"))
    pmid_passes_filter = [(pmid in filtered_pmids) for pmid in query_pmids]
    return map_booleans_to_flags(pmid_passes_filter)
Example #5
0
def has_data_submission(query_pmids):
    """Flag which PubMed IDs are listed as a citation in GEO.

    The IDs are run through ``filter_pmids`` with the ``pubmed_gds[filter]``
    query; an ID counts as a hit when it appears in that result.

    Args:
        query_pmids: sequence of PubMed IDs to check.

    Returns:
        A list parallel to ``query_pmids`` holding the flags produced by
        ``map_booleans_to_flags`` (0 or 1 per the original contract).
        An empty or ``None`` input returns ``[]`` without querying PubMed.
    """
    if not query_pmids:
        return []
    # Build the lookup once: set membership is O(1), whereas testing each
    # query ID against the raw result list rescans it every time.
    filtered_pmids = set(filter_pmids(query_pmids, "pubmed_gds[filter]"))
    pmid_passes_filter = [(pmid in filtered_pmids) for pmid in query_pmids]
    return map_booleans_to_flags(pmid_passes_filter)
Example #6
0
def base():
    """Exploratory check of a hand-built PubMed query against two PMID sets.

    Builds several query strings, scores ``query_for_5or6s`` with
    ``calc_query_performance(A, query_for_5or6s, A+B)`` and prints the
    rounded precision and recall.  ``A`` and ``B`` (and the
    ``healthy_1991to2001_*`` names in the disabled branch) are not defined
    here; they must exist at module level.  Returns nothing.
    """
    base_query = """(("humans"[mesh] AND "magnetic resonance imaging"[mesh] AND Journal Article[ptyp] NOT "mental disorders"[mesh]) 
    NOT (Editorial[ptyp] OR Letter[ptyp] OR Meta-Analysis[ptyp] OR Practice Guideline[ptyp] OR Review[ptyp] OR Case Reports[ptyp] OR Comment[ptyp] OR Corrected and Republished Article[ptyp])
    AND ("1991"[PDAT] : "2001"[PDAT])
    AND English[lang])"""

    # query_for_1or2s and query_for_3or4s are only referenced from the
    # commented-out tail of the query_to_try line below.
    query_for_1or2s = """(Movements OR Finger OR (Somatosensory Cortex[mesh]) OR somatosensory[Title/Abstract] OR (Cerebrovascular Circulation[mesh]) OR "primary motor"[Title/Abstract] OR "primary visual"[Title/Abstract] OR sensorimotor[Title/Abstract] OR "motor area"[Title/Abstract] OR oxygenation[Title/Abstract] OR (Motor Cortex[mesh]) OR "visual cortex"[Title/Abstract] OR (Acoustic Stimulation[mesh]))"""
    query_for_3or4s = """((Temporal Lobe[mesh]) OR (Prefrontal Cortex[mesh]) OR (Pattern Recognition, Visual[mesh]) OR (Visual Perception[mesh]) OR semantic[Title/Abstract] OR verbal[Title/Abstract] OR "left inferior"[Title/Abstract] OR retrieval[Title/Abstract] OR memory[Title/Abstract] OR language[Title/Abstract] OR Memory[mesh])"""
    #query_for_5or6s = """(Amygdala OR Emotional OR emotionally[Title/Abstract] OR (Facial Expression[mesh]) OR faces[Title/Abstract] OR facial[Title/Abstract] OR affective[Title/Abstract] OR Affect[mesh] OR dissociation[Title/Abstract] OR Arousal[mesh] OR (Decision Making[mesh]) OR happy[Title/Abstract] OR attentional[Title/Abstract] or networks[Title/Abstract] or Cognition[mesh])"""
    #query_for_5or6s = """(Amygdala OR Emotional OR emotionally[Title/Abstract] OR (Facial Expression[mesh]) OR faces[Title/Abstract] OR facial[Title/Abstract] OR affective[Title/Abstract] OR Affect[mesh] OR Arousal[mesh] OR (Decision Making[mesh]) OR happy[Title/Abstract] OR attentional[Title/Abstract] or Cognition[mesh])"""
    # NOTE(review): the active query below opens two parentheses but closes
    # only one -- confirm whether the missing ")" is intentional.
    query_for_5or6s = """(Amygdala OR Emotions[mesh] OR emotion*[Title/Abstract] OR (Facial Expression[mesh]) OR faces[Title/Abstract] OR facial[Title/Abstract] OR Affect[mesh]"""
    #query_for_5or6s = """(Amygdala OR Emotional OR emotionally[Title/Abstract] OR (Facial Expression[mesh]) OR faces[Title/Abstract] OR facial[Title/Abstract] OR Affect[mesh] OR Arousal[mesh] OR (Decision Making[mesh])"""
    # query_to_try is built but not used by any active statement below.
    query_to_try = base_query + " AND " + query_for_5or6s# + " NOT (" + query_for_1or2s + " OR " + query_for_3or4s + ")"

    #(pr, re) = calc_query_performance(A, query_to_try, A+B)
    # The local name "re" hides any module-level "re" inside this function.
    (pr, re) = calc_query_performance(A, query_for_5or6s, A+B)
    print "Precision:", round(pr, 2)
    print "Recall:", round(re, 2)



    #filtered = pubmed.filter_pmids(B, "Emotions[mesh]")
    # The filter result is computed, but the loop body is a no-op
    # (the print is commented out).
    filtered = pubmed.filter_pmids(B, query_for_5or6s)
    for pmid in filtered:
    #    print pmid
        pass

    # Disabled debugging branch: never executes.
    if (False):
        pmids = pubmed.filter_pmids(B, "Visual Cortex[mesh]")
        for pmid in pmids:
            print pmid
        print "\n"
        for pmid in pmids:
            print pmid,
            if pmid in healthy_1991to2001_3or4: print "3or4",
            if pmid in healthy_1991to2001_5or6: print "5or6",
            print
Example #7
0
def calc_query_performance(gold_pmids, query, recall_filter_pmids=None):
    """Score a PubMed query against a gold-standard list of PMIDs.

    The query is applied (via ``pubmed.filter_pmids``) to
    ``recall_filter_pmids``; the PMIDs it keeps are compared with the gold
    set.  The query and five diagnostic counts are printed along the way.

    Args:
        gold_pmids: PMIDs the query is expected to find.
        query: PubMed query string to evaluate.
        recall_filter_pmids: pool of PMIDs the query is run against.
            Defaults to ``gold_pmids`` when empty or ``None``.

    Returns:
        ``(precision, recall)`` as floats.  Precision is the share of
        distinct retrieved PMIDs that are gold; recall is the share of
        distinct gold PMIDs that were retrieved.  Either value is ``0.0``
        when its denominator is empty, instead of raising
        ``ZeroDivisionError``.
    """
    print(query)
    if not recall_filter_pmids:
        recall_filter_pmids = gold_pmids

    found_pmids = pubmed.filter_pmids(recall_filter_pmids, query)
    found_set = set(found_pmids)
    gold_set = set(gold_pmids)
    true_positives = len(gold_set & found_set)

    # Diagnostic counts, printed in the same order as before.
    print(len(gold_pmids))
    print(len(recall_filter_pmids))
    print(len(found_pmids))
    print(true_positives)
    print(len(found_set))

    # A query that matches nothing has no meaningful precision; report 0.0.
    precision = float(true_positives) / len(found_set) if found_set else 0.0
    # The numerator counts distinct PMIDs, so the denominator must too;
    # otherwise duplicates in gold_pmids would make a recall of 1.0 unreachable.
    recall = float(true_positives) / len(gold_set) if gold_set else 0.0
    return (precision, recall)
Example #8
0
def get_all_attributes(id_dict):
    """Extend every record in ``id_dict`` with attributes of its reuse paper.

    Each value of ``id_dict`` is a tuple; this function reads position 0
    (used as the accession key), position 4 (the reuse PMCID, a string)
    and position 5 (a sequence whose first item is taken as the reuse
    PMID).  Attributes are looked up in bulk for all reuse PMIDs / PMCIDs
    and then attached per record.

    Relies on module-level ``metaquery``, ``meshes`` and ``words``.

    Args:
        id_dict: mapping whose values are the record tuples described above.

    Returns:
        A dict keyed by ``accession_key + reuse_pmcid``.  Each value is the
        original tuple, a ``"|"`` separator, the PMCID-based attributes
        (biolink filter, basic reuse filter, creation filter, OA excerpts,
        word filters), another ``"|"``, and the PMID-based attributes
        (affiliation, journal, year, date published, MEDLINE status, GEO
        reuse flag, open-access flag, meta-analysis flag, MeSH filters).
        A record whose PMCID or PMID has no attributes gets an empty
        section, because the lookup tables default to ``()``.
    """
    all_reuse_pmcids = flatten_unique([vals[4] for vals in id_dict.values()])
    all_reuse_pmids = flatten_unique([vals[5] for vals in id_dict.values()])

    # PMID-keyed attributes, one entry per item of all_reuse_pmids.
    reuse_affiliation = affiliation.institution(all_reuse_pmids)
    journal = pubmed.journal(all_reuse_pmids)
    year = pubmed.year_published(all_reuse_pmids)
    date_published = pubmed.date_published(all_reuse_pmids)
    medline_status = pubmed.medline_status(all_reuse_pmids)
    is_geo_reuse = geo_reuse.is_geo_reuse(all_reuse_pmids)
    reuse_is_oa = pubmed.is_open_access(all_reuse_pmids)
    metaanal = pubmed._get_flags_for_pattern(all_reuse_pmids, metaquery)

    # PMCID-keyed attributes.  The pattern is a raw string so "\d" reaches
    # the regex engine untouched (same value as the old non-raw literal).
    oa_excerpts = oaexcerpt.get_oa_excerpts(
        all_reuse_pmcids,
        r"(GSE.\d|GDS.\d|omnibus)",
        flags=re.IGNORECASE | re.MULTILINE)

    biolink_filter = pubmedcentral.get_flags_for_pattern(
        all_reuse_pmcids,
        '(geo OR omnibus)  AND microarray  AND "gene expression" AND accession NOT (databases OR user OR users  OR (public AND accessed) OR (downloaded AND published))')
    basic_reuse_filter = pubmedcentral.get_flags_for_pattern(
        all_reuse_pmcids,
        '"gene expression omnibus" AND (submitted OR deposited)')
    creation_filter = pubmedcentral.get_flags_for_pattern(
        all_reuse_pmcids,
        '("gene expression" [text] AND "microarray" [text] AND "cell" [text] AND "rna" [text]) AND ("rneasy" [text] OR "trizol" [text] OR "real-time pcr" [text]) NOT ("tissue microarray*" [text] OR "cpg island*" [text])')

    # Hits are stored as sets so the per-ID membership tests below are O(1).
    has_mesh = {}
    for term in meshes:
        has_mesh[term] = set(pubmed.filter_pmids(all_reuse_pmids, term))
    mesh_filters = [
        ";".join([term for term in has_mesh if pmid in has_mesh[term]])
        for pmid in all_reuse_pmids
    ]

    has_word = {}
    for word in words:
        has_word[word] = set(
            pubmedcentral.filter_pmcids(all_reuse_pmcids, word))
    word_filters = [
        ";".join([word for word in has_word if pmcid in has_word[word]])
        for pmcid in all_reuse_pmcids
    ]
    print(word_filters)

    # defaultdict(tuple) makes an unknown PMID / PMCID contribute an empty
    # tuple instead of raising KeyError.
    reuse_pmid_dict = defaultdict(
        tuple,
        zip(all_reuse_pmids,
            zip(reuse_affiliation, journal, year, date_published,
                medline_status, is_geo_reuse, reuse_is_oa, metaanal,
                mesh_filters)))
    reuse_pmcid_dict = defaultdict(
        tuple,
        zip(all_reuse_pmcids,
            zip(biolink_filter, basic_reuse_filter, creation_filter,
                oa_excerpts, word_filters)))

    full_dict = {}
    for vals in id_dict.values():
        accession_key = vals[0]  # renamed from "id" to stop shadowing the builtin
        reuse_pmcid = vals[4]
        reuse_pmid = vals[5][0] if vals[5] else ""
        record_key = accession_key + reuse_pmcid
        full_dict[record_key] = (vals + ("|",) + reuse_pmcid_dict[reuse_pmcid]
                                 + ("|",) + reuse_pmid_dict[reuse_pmid])
        print(full_dict[record_key])
    return full_dict
def get_all_pmids():
    """Return PMC and portal PMIDs after filtering them through PubMed.

    The PMC results for ``pmc_query`` come first in the concatenated list
    that is passed to ``pubmed.filter_pmids`` with ``pubmed_query``.
    """
    from_pmc = get_pmc_results(pmc_query)
    from_portals = datasources.portal_pmids.get_all_pmids()
    return pubmed.filter_pmids(from_pmc + from_portals, pubmed_query)
def get_all_ae_pmids(query_pmids, complete_ae_pmids):
    """Return the query PMIDs that also occur in ``complete_ae_pmids``.

    Args:
        query_pmids: PMIDs to test; their order and any duplicates are
            preserved in the result.
        complete_ae_pmids: collection of PMIDs to test membership against.

    Returns:
        A new list with every item of ``query_pmids`` found in
        ``complete_ae_pmids``.
    """
    # Build the lookup once so each membership test is O(1) rather than a
    # scan of the whole collection per query PMID.
    known_ae_pmids = set(complete_ae_pmids)
    return [pmid for pmid in query_pmids if pmid in known_ae_pmids]

def get_all_shared_pmids(gds_pmids, ae_pmids):
    """Return the distinct PMIDs found in either input list.

    The two lists are concatenated and de-duplicated through a set, so the
    order of the returned list is not guaranteed.
    """
    combined = gds_pmids + ae_pmids
    distinct = set(combined)
    return list(distinct)
        
# Disabled exploratory run: computes the proportion of filtered PMIDs that
# appear in GEO (gds) or ArrayExpress (ae).  Never executes as written.
if False:
    pmids = get_all_pmids()
    print(len(pmids))

    # Rebuild the same PMID list step by step.
    pmc_pmids = get_pmc_results(pmc_query)
    portal_pmids = datasources.portal_pmids.get_all_pmids()
    all_pmids_before_filter = pmc_pmids + portal_pmids
    pmids = pubmed.filter_pmids(all_pmids_before_filter, pubmed_query)

    gds_pmids = get_all_gds_pmids(pmids)
    complete_ae_pmids = get_complete_arrayexpress_pmids()
    ae_pmids = get_all_ae_pmids(pmids, complete_ae_pmids)
    shared_pmids = get_all_shared_pmids(gds_pmids, ae_pmids)

    # float() forces true division under Python 2.
    shared_proportion = float(len(shared_pmids)) / len(pmids)
    print(shared_proportion)

# Too many duplicates of this.  Need to refactor duplicate code!
def _map_booleans_to_flags(list_of_True_False):
    mapping = {True:'1', False:'0'}
    list_of_flags = [mapping[i] for i in list_of_True_False]
    return(list_of_flags)

# Too many duplicates of this.  Need to refactor duplicate code!
def get_is_in_flags(query_pmids, base_pmids):
Example #11
0
#A = healthy_1991to2008_5or6
#B = healthy_2002to2008_not5or6
#A = healthy_1991to2001_3or4
#B = healthy_1991to2001_1or2 + healthy_1991to2001_5or6 # + healthy_1991to2001_7

#A = healthy_1991to2001_5or6
#B = healthy_1991to2001_1or2 + healthy_1991to2001_3or4


from data import neuroethicslike
#A = neuroethicslike.pubmed_neurethicslike_query_results
#A = neuroethicslike.pubmed_fmri_neuroethicslike_query_results

# Exploratory script: run a PubMed search, narrow it, and print frequency
# tables for the resulting PMID set A.
# NOTE(review): the query mixes AND and OR at the top level without outer
# parentheses -- confirm PubMed groups it as intended.
base_query = """("fmri"[text] OR "magnetic resonance imaging"[mesh]) AND ((neurosciences[mesh] OR neuroscience[Title/Abstract] OR neurology[mesh]) AND (ethics[sh] OR ethical[Title/Abstract] OR "bioethical issues"[mesh] OR "ethics, medical"[mesh] OR "legislation and jurisprudence"[Subheading])) OR neuroethic*[Title/Abstract]"""
# Both calls below run at import time.
base = pubmed.search(base_query)
# A = the subset of the search results that also passes "Personal Autonomy".
A = pubmed.filter_pmids(base, "Personal Autonomy")

# MeSH frequency table for A, limited with the argument 150.
dist = coveyquery.get_mesh_frequency_distributions(A, getter=pubmed.mesh_basic)
print "\n\nTop list for (A, getter=pubmed.mesh_basic):"
coveyquery.print_frequency_proportion(A, dist, 150)

# Disabled: title-only and abstract-only text frequency tables.
if False:
    dist = coveyquery.get_text_frequency_distributions(A, getter=pubmed.article_title)
    print "\n\nTop list for (A, getter=pubmed.article_title):"
    coveyquery.print_frequency_proportion(A, dist, 30)

    dist = coveyquery.get_text_frequency_distributions(A, getter=pubmed.abstract)
    print "\n\nTop list for (A, getter=pubmed.abstract):"
    coveyquery.print_frequency_proportion(A, dist, 30)

# Text frequency distribution over title + abstract; what is done with it
# lies beyond the visible part of this script.
dist = coveyquery.get_text_frequency_distributions(A, getter=pubmed.title_and_abstract)
Example #12
0
def get_PMIDs_from_DOIs(DOIs):
    """Look up PubMed IDs for a collection of DOIs.

    Every DOI is suffixed with the ``[doi]`` field tag, the tagged terms
    are OR-ed into one query, and the result of ``filter_pmids`` for that
    query is returned.
    """
    doi_terms = (doi + "[doi]" for doi in DOIs)
    doi_query = " OR ".join(doi_terms)
    # NOTE(review): "1" is passed where filter_pmids takes the PMIDs to
    # filter -- presumably a placeholder; confirm against filter_pmids.
    return filter_pmids("1", doi_query)
Example #13
0
    return (all_ae_pmids)


def get_all_shared_pmids(gds_pmids, ae_pmids):
    """Merge two PMID lists into one list without duplicates.

    De-duplication goes through a set, so the order of the result is not
    guaranteed.
    """
    merged = gds_pmids + ae_pmids
    return list(set(merged))


# Disabled exploratory run: prints the fraction of filtered PMIDs that are
# found in GEO (gds) or ArrayExpress (ae).  Never executes as written.
if False:
    pmids = get_all_pmids()
    print(len(pmids))

    # Same pipeline as get_all_pmids(), spelled out step by step.
    pmc_pmids = get_pmc_results(pmc_query)
    portal_pmids = datasources.portal_pmids.get_all_pmids()
    all_pmids_before_filter = pmc_pmids + portal_pmids
    pmids = pubmed.filter_pmids(all_pmids_before_filter, pubmed_query)

    gds_pmids = get_all_gds_pmids(pmids)
    complete_ae_pmids = get_complete_arrayexpress_pmids()
    ae_pmids = get_all_ae_pmids(pmids, complete_ae_pmids)
    shared_pmids = get_all_shared_pmids(gds_pmids, ae_pmids)

    # float() forces true division under Python 2.
    shared_proportion = float(len(shared_pmids)) / len(pmids)
    print(shared_proportion)


# Too many duplicates of this.  Need to refactor duplicate code!
def _map_booleans_to_flags(list_of_True_False):
    mapping = {True: '1', False: '0'}
    list_of_flags = [mapping[i] for i in list_of_True_False]
    return (list_of_flags)

Example #14
0
def get_all_attributes(id_dict):
    """Extend every record in ``id_dict`` with attributes of its reuse paper.

    Each value of ``id_dict`` is a tuple; this function reads position 0
    (used as the accession key), position 4 (the reuse PMCID, a string)
    and position 5 (a sequence whose first item is taken as the reuse
    PMID).  Attributes are looked up in bulk for all reuse PMIDs / PMCIDs
    and then attached per record.

    Relies on module-level ``metaquery``, ``meshes`` and ``words``.

    Args:
        id_dict: mapping whose values are the record tuples described above.

    Returns:
        A dict keyed by ``accession_key + reuse_pmcid``.  Each value is the
        original tuple, a ``"|"`` separator, the PMCID-based attributes
        (biolink filter, basic reuse filter, creation filter, OA excerpts,
        word filters), another ``"|"``, and the PMID-based attributes
        (affiliation, journal, year, date published, MEDLINE status, GEO
        reuse flag, open-access flag, meta-analysis flag, MeSH filters).
        A record whose PMCID or PMID has no attributes gets an empty
        section, because the lookup tables default to ``()``.
    """
    all_reuse_pmcids = flatten_unique([vals[4] for vals in id_dict.values()])
    all_reuse_pmids = flatten_unique([vals[5] for vals in id_dict.values()])

    # PMID-keyed attributes, one entry per item of all_reuse_pmids.
    reuse_affiliation = affiliation.institution(all_reuse_pmids)
    journal = pubmed.journal(all_reuse_pmids)
    year = pubmed.year_published(all_reuse_pmids)
    date_published = pubmed.date_published(all_reuse_pmids)
    medline_status = pubmed.medline_status(all_reuse_pmids)
    is_geo_reuse = geo_reuse.is_geo_reuse(all_reuse_pmids)
    reuse_is_oa = pubmed.is_open_access(all_reuse_pmids)
    metaanal = pubmed._get_flags_for_pattern(all_reuse_pmids, metaquery)

    # PMCID-keyed attributes.  The pattern is a raw string so "\d" reaches
    # the regex engine untouched (same value as the old non-raw literal).
    oa_excerpts = oaexcerpt.get_oa_excerpts(
        all_reuse_pmcids,
        r"(GSE.\d|GDS.\d|omnibus)",
        flags=re.IGNORECASE | re.MULTILINE)

    biolink_filter = pubmedcentral.get_flags_for_pattern(
        all_reuse_pmcids,
        '(geo OR omnibus)  AND microarray  AND "gene expression" AND accession NOT (databases OR user OR users  OR (public AND accessed) OR (downloaded AND published))'
    )
    basic_reuse_filter = pubmedcentral.get_flags_for_pattern(
        all_reuse_pmcids,
        '"gene expression omnibus" AND (submitted OR deposited)')
    creation_filter = pubmedcentral.get_flags_for_pattern(
        all_reuse_pmcids,
        '("gene expression" [text] AND "microarray" [text] AND "cell" [text] AND "rna" [text]) AND ("rneasy" [text] OR "trizol" [text] OR "real-time pcr" [text]) NOT ("tissue microarray*" [text] OR "cpg island*" [text])'
    )

    # Hits are stored as sets so the per-ID membership tests below are O(1).
    has_mesh = {}
    for term in meshes:
        has_mesh[term] = set(pubmed.filter_pmids(all_reuse_pmids, term))
    mesh_filters = [
        ";".join([term for term in has_mesh if pmid in has_mesh[term]])
        for pmid in all_reuse_pmids
    ]

    has_word = {}
    for word in words:
        has_word[word] = set(
            pubmedcentral.filter_pmcids(all_reuse_pmcids, word))
    word_filters = [
        ";".join([word for word in has_word if pmcid in has_word[word]])
        for pmcid in all_reuse_pmcids
    ]
    print(word_filters)

    # defaultdict(tuple) makes an unknown PMID / PMCID contribute an empty
    # tuple instead of raising KeyError.
    reuse_pmid_dict = defaultdict(
        tuple,
        zip(
            all_reuse_pmids,
            zip(reuse_affiliation, journal, year, date_published,
                medline_status, is_geo_reuse, reuse_is_oa, metaanal,
                mesh_filters)))
    reuse_pmcid_dict = defaultdict(
        tuple,
        zip(
            all_reuse_pmcids,
            zip(biolink_filter, basic_reuse_filter, creation_filter,
                oa_excerpts, word_filters)))

    full_dict = {}
    for vals in id_dict.values():
        accession_key = vals[0]  # renamed from "id" to stop shadowing the builtin
        reuse_pmcid = vals[4]
        reuse_pmid = vals[5][0] if vals[5] else ""
        record_key = accession_key + reuse_pmcid
        full_dict[record_key] = (vals + ("|", ) + reuse_pmcid_dict[reuse_pmcid]
                                 + ("|", ) + reuse_pmid_dict[reuse_pmid])
        print(full_dict[record_key])
    return full_dict
def get_all_gds_pmids(pmids):
    """Return the result of filtering ``pmids`` with ``pubmed_gds[filter]``.

    Thin wrapper around ``pubmed.filter_pmids``.
    """
    gds_filter = "pubmed_gds[filter]"
    return pubmed.filter_pmids(pmids, gds_filter)
Example #16
0
def get_PMIDs_from_DOIs(DOIs):
    """Resolve DOIs to PubMed IDs with a single OR-ed ``[doi]`` query.

    Each DOI gets the ``[doi]`` field tag appended; the tagged terms are
    joined with `` OR `` and passed to ``filter_pmids``, whose result is
    returned as is.
    """
    tagged = []
    for doi in DOIs:
        tagged.append(doi + "[doi]")
    query = " OR ".join(tagged)
    # NOTE(review): "1" is passed where filter_pmids takes the PMIDs to
    # filter -- presumably a placeholder; confirm against filter_pmids.
    return filter_pmids("1", query)
Example #17
0
def get_all_gds_pmids(pmids):
    """Filter ``pmids`` through PubMed using the ``pubmed_gds[filter]`` query.

    Returns whatever ``pubmed.filter_pmids`` returns for that query.
    """
    return pubmed.filter_pmids(pmids, "pubmed_gds[filter]")