Example #1
0
def get_response_dict(id_type, ids, pmc_query):
    response_dict = {}
    num_ids = len(ids)
    id_counter = 0
    geo_instance = geo.GEO()
    for accession in ids:
        print accession,
        id_counter += 1
        reuse_pmcids = get_accession_in_pmc_fulltext(id_type, accession,
                                                     pmc_query)
        if ((not reuse_pmcids) or ("<ERROR>" in "".join(reuse_pmcids))):
            print " Nope"
            continue
        stripped_accession = geo.get_stripped_accession(accession)
        if id_type == "GSE":
            gse_accessions = [stripped_accession]
            gds_accessions = [
                geo.get_stripped_accession(acc)
                for acc in geo.get_gds_from_gse("GSE" + stripped_accession)
            ]
        else:
            gds_accessions = [stripped_accession]
            gse_accessions = [
                geo.get_stripped_accession(acc)
                for acc in geo.get_gse_from_gds("GDS" + stripped_accession)
            ]

        try:
            submit_pmids = geo_instance.pmids("GSE" + gse_accessions[0])
        except Exception:
            continue

        print id_counter, "of", num_ids, ":", stripped_accession, "--", (
            submit_pmids), "; ", len(reuse_pmcids)
        for reuse_pmcid in reuse_pmcids:
            reuse_pmids_for_pmcid = pubmedcentral.pmcids_to_pmids(reuse_pmcid)
            dict_key = (id_type + stripped_accession, reuse_pmcid)
            this_submit_contributors = flatten_unique([
                geo_instance.contributors("GSE" + gse_accession)
                for gse_accession in gse_accessions
            ])
            this_submit_authors = get_authors_and_submittors_from_accession(
                submit_pmids, this_submit_contributors)
            this_reuse_authors = get_authors_and_submittors_from_accession(
                reuse_pmids_for_pmcid)
            intersect = get_author_intersect_submit_reuse(
                this_submit_authors, this_reuse_authors)
            submit_affiliation = affiliation.institution(submit_pmids)
            release_date = geo_instance.release_date("GSE" + gse_accession)
            response_dict[dict_key] = (id_type + stripped_accession,
                                       gse_accessions, gds_accessions,
                                       submit_pmids, reuse_pmcid,
                                       reuse_pmids_for_pmcid,
                                       this_submit_authors, this_reuse_authors,
                                       intersect, submit_affiliation,
                                       release_date)
            #print response_dict[dict_key]
    return (response_dict)
Example #2
0
def get_all_attributes(id_dict):
    all_accession_key = [vals[0] for vals in id_dict.values()]
    all_submit_pmids = flatten_unique([vals[3] for vals in id_dict.values()])
    all_reuse_pmcids = flatten_unique([vals[4] for vals in id_dict.values()])
    all_reuse_pmids = flatten_unique([vals[5] for vals in id_dict.values()])

    reuse_affiliation = affiliation.institution(all_reuse_pmids)
    journal         = pubmed.journal(all_reuse_pmids)
    year            = pubmed.year_published(all_reuse_pmids)
    date_published  = pubmed.date_published(all_reuse_pmids)
    medline_status  = pubmed.medline_status(all_reuse_pmids)
    is_geo_reuse    = geo_reuse.is_geo_reuse(all_reuse_pmids)
    reuse_is_oa     = pubmed.is_open_access(all_reuse_pmids)
    metaanal        = pubmed._get_flags_for_pattern(all_reuse_pmids, metaquery)
    
    oa_excerpts     = oaexcerpt.get_oa_excerpts(all_reuse_pmcids, "(GSE.\d|GDS.\d|omnibus)", flags = re.IGNORECASE|re.MULTILINE)

    biolink_filter   = pubmedcentral.get_flags_for_pattern(all_reuse_pmcids, '(geo OR omnibus)  AND microarray  AND "gene expression" AND accession NOT (databases OR user OR users  OR (public AND accessed) OR (downloaded AND published))')
    basic_reuse_filter = pubmedcentral.get_flags_for_pattern(all_reuse_pmcids, '"gene expression omnibus" AND (submitted OR deposited)')
    creation_filter    = pubmedcentral.get_flags_for_pattern(all_reuse_pmcids, '("gene expression" [text] AND "microarray" [text] AND "cell" [text] AND "rna" [text]) AND ("rneasy" [text] OR "trizol" [text] OR "real-time pcr" [text]) NOT ("tissue microarray*" [text] OR "cpg island*" [text])')

    has_mesh = {}
    for term in meshes:
        has_mesh[term] = pubmed.filter_pmids(all_reuse_pmids, term)
    mesh_filters = [";".join([term for term in has_mesh if pmid in has_mesh[term]]) for pmid in all_reuse_pmids]
    
    has_word = {}
    for word in words:
        has_word[word] = pubmedcentral.filter_pmcids(all_reuse_pmcids, word)
    word_filters = [";".join([word for word in has_word if pmcid in has_word[word]]) for pmcid in all_reuse_pmcids]
    print word_filters
    
    reuse_pmid_dict = defaultdict(tuple, zip(all_reuse_pmids, zip(reuse_affiliation, journal, year, date_published, medline_status, is_geo_reuse, reuse_is_oa, metaanal, mesh_filters)))
    reuse_pmcid_dict = defaultdict(tuple, zip(all_reuse_pmcids, zip(biolink_filter, basic_reuse_filter, creation_filter, oa_excerpts, word_filters)))

    full_dict = {}
    for vals in id_dict.values():
        id = vals[0]
        reuse_pmcid = vals[4]
        reuse_pmid = vals[5][0] if vals[5] else ""
        full_dict[id+reuse_pmcid] = vals + ("|",) + reuse_pmcid_dict[reuse_pmcid] + ("|",) + reuse_pmid_dict[reuse_pmid]
        print full_dict[id+reuse_pmcid]
    return(full_dict)
Example #3
0
def get_response_dict(id_type, ids, pmc_query):
    response_dict = {}
    num_ids = len(ids)
    id_counter = 0
    geo_instance = geo.GEO()
    for accession in ids:
        print accession,
        id_counter += 1
        reuse_pmcids = get_accession_in_pmc_fulltext(id_type, accession, pmc_query)        
        if ((not reuse_pmcids) or ("<ERROR>" in "".join(reuse_pmcids))):
            print " Nope"
            continue
        stripped_accession = geo.get_stripped_accession(accession)  
        if id_type=="GSE":
            gse_accessions = [stripped_accession]
            gds_accessions = [geo.get_stripped_accession(acc) for acc in geo.get_gds_from_gse("GSE"+stripped_accession)]
        else:
            gds_accessions = [stripped_accession]  
            gse_accessions = [geo.get_stripped_accession(acc) for acc in geo.get_gse_from_gds("GDS"+stripped_accession)]
            
        try:       
            submit_pmids = geo_instance.pmids("GSE"+gse_accessions[0])
        except Exception:
            continue
            
        print id_counter, "of", num_ids, ":", stripped_accession, "--", (submit_pmids), "; ", len(reuse_pmcids)
        for reuse_pmcid in reuse_pmcids:
            reuse_pmids_for_pmcid = pubmedcentral.pmcids_to_pmids(reuse_pmcid)
            dict_key = (id_type+stripped_accession, reuse_pmcid)
            this_submit_contributors = flatten_unique([geo_instance.contributors("GSE"+gse_accession) for gse_accession in gse_accessions])
            this_submit_authors = get_authors_and_submittors_from_accession(submit_pmids, this_submit_contributors)
            this_reuse_authors   = get_authors_and_submittors_from_accession(reuse_pmids_for_pmcid)
            intersect = get_author_intersect_submit_reuse(this_submit_authors, this_reuse_authors) 
            submit_affiliation = affiliation.institution(submit_pmids)
            release_date = geo_instance.release_date("GSE"+gse_accession)     
            response_dict[dict_key] = (id_type+stripped_accession, gse_accessions, gds_accessions, submit_pmids, reuse_pmcid, reuse_pmids_for_pmcid, this_submit_authors, this_reuse_authors, intersect, submit_affiliation, release_date)
            #print response_dict[dict_key]
    return(response_dict)
Example #4
0
def get_all_attributes(id_dict):
    all_accession_key = [vals[0] for vals in id_dict.values()]
    all_submit_pmids = flatten_unique([vals[3] for vals in id_dict.values()])
    all_reuse_pmcids = flatten_unique([vals[4] for vals in id_dict.values()])
    all_reuse_pmids = flatten_unique([vals[5] for vals in id_dict.values()])

    reuse_affiliation = affiliation.institution(all_reuse_pmids)
    journal = pubmed.journal(all_reuse_pmids)
    year = pubmed.year_published(all_reuse_pmids)
    date_published = pubmed.date_published(all_reuse_pmids)
    medline_status = pubmed.medline_status(all_reuse_pmids)
    is_geo_reuse = geo_reuse.is_geo_reuse(all_reuse_pmids)
    reuse_is_oa = pubmed.is_open_access(all_reuse_pmids)
    metaanal = pubmed._get_flags_for_pattern(all_reuse_pmids, metaquery)

    oa_excerpts = oaexcerpt.get_oa_excerpts(all_reuse_pmcids,
                                            "(GSE.\d|GDS.\d|omnibus)",
                                            flags=re.IGNORECASE | re.MULTILINE)

    biolink_filter = pubmedcentral.get_flags_for_pattern(
        all_reuse_pmcids,
        '(geo OR omnibus)  AND microarray  AND "gene expression" AND accession NOT (databases OR user OR users  OR (public AND accessed) OR (downloaded AND published))'
    )
    basic_reuse_filter = pubmedcentral.get_flags_for_pattern(
        all_reuse_pmcids,
        '"gene expression omnibus" AND (submitted OR deposited)')
    creation_filter = pubmedcentral.get_flags_for_pattern(
        all_reuse_pmcids,
        '("gene expression" [text] AND "microarray" [text] AND "cell" [text] AND "rna" [text]) AND ("rneasy" [text] OR "trizol" [text] OR "real-time pcr" [text]) NOT ("tissue microarray*" [text] OR "cpg island*" [text])'
    )

    has_mesh = {}
    for term in meshes:
        has_mesh[term] = pubmed.filter_pmids(all_reuse_pmids, term)
    mesh_filters = [
        ";".join([term for term in has_mesh if pmid in has_mesh[term]])
        for pmid in all_reuse_pmids
    ]

    has_word = {}
    for word in words:
        has_word[word] = pubmedcentral.filter_pmcids(all_reuse_pmcids, word)
    word_filters = [
        ";".join([word for word in has_word if pmcid in has_word[word]])
        for pmcid in all_reuse_pmcids
    ]
    print word_filters

    reuse_pmid_dict = defaultdict(
        tuple,
        zip(
            all_reuse_pmids,
            zip(reuse_affiliation, journal, year, date_published,
                medline_status, is_geo_reuse, reuse_is_oa, metaanal,
                mesh_filters)))
    reuse_pmcid_dict = defaultdict(
        tuple,
        zip(
            all_reuse_pmcids,
            zip(biolink_filter, basic_reuse_filter, creation_filter,
                oa_excerpts, word_filters)))

    full_dict = {}
    for vals in id_dict.values():
        id = vals[0]
        reuse_pmcid = vals[4]
        reuse_pmid = vals[5][0] if vals[5] else ""
        full_dict[id + reuse_pmcid] = vals + (
            "|", ) + reuse_pmcid_dict[reuse_pmcid] + (
                "|", ) + reuse_pmid_dict[reuse_pmid]
        print full_dict[id + reuse_pmcid]
    return (full_dict)