def get_response_dict(id_type, ids, pmc_query): response_dict = {} num_ids = len(ids) id_counter = 0 geo_instance = geo.GEO() for accession in ids: print accession, id_counter += 1 reuse_pmcids = get_accession_in_pmc_fulltext(id_type, accession, pmc_query) if ((not reuse_pmcids) or ("<ERROR>" in "".join(reuse_pmcids))): print " Nope" continue stripped_accession = geo.get_stripped_accession(accession) if id_type == "GSE": gse_accessions = [stripped_accession] gds_accessions = [ geo.get_stripped_accession(acc) for acc in geo.get_gds_from_gse("GSE" + stripped_accession) ] else: gds_accessions = [stripped_accession] gse_accessions = [ geo.get_stripped_accession(acc) for acc in geo.get_gse_from_gds("GDS" + stripped_accession) ] try: submit_pmids = geo_instance.pmids("GSE" + gse_accessions[0]) except Exception: continue print id_counter, "of", num_ids, ":", stripped_accession, "--", ( submit_pmids), "; ", len(reuse_pmcids) for reuse_pmcid in reuse_pmcids: reuse_pmids_for_pmcid = pubmedcentral.pmcids_to_pmids(reuse_pmcid) dict_key = (id_type + stripped_accession, reuse_pmcid) this_submit_contributors = flatten_unique([ geo_instance.contributors("GSE" + gse_accession) for gse_accession in gse_accessions ]) this_submit_authors = get_authors_and_submittors_from_accession( submit_pmids, this_submit_contributors) this_reuse_authors = get_authors_and_submittors_from_accession( reuse_pmids_for_pmcid) intersect = get_author_intersect_submit_reuse( this_submit_authors, this_reuse_authors) submit_affiliation = affiliation.institution(submit_pmids) release_date = geo_instance.release_date("GSE" + gse_accession) response_dict[dict_key] = (id_type + stripped_accession, gse_accessions, gds_accessions, submit_pmids, reuse_pmcid, reuse_pmids_for_pmcid, this_submit_authors, this_reuse_authors, intersect, submit_affiliation, release_date) #print response_dict[dict_key] return (response_dict)
def get_all_attributes(id_dict): all_accession_key = [vals[0] for vals in id_dict.values()] all_submit_pmids = flatten_unique([vals[3] for vals in id_dict.values()]) all_reuse_pmcids = flatten_unique([vals[4] for vals in id_dict.values()]) all_reuse_pmids = flatten_unique([vals[5] for vals in id_dict.values()]) reuse_affiliation = affiliation.institution(all_reuse_pmids) journal = pubmed.journal(all_reuse_pmids) year = pubmed.year_published(all_reuse_pmids) date_published = pubmed.date_published(all_reuse_pmids) medline_status = pubmed.medline_status(all_reuse_pmids) is_geo_reuse = geo_reuse.is_geo_reuse(all_reuse_pmids) reuse_is_oa = pubmed.is_open_access(all_reuse_pmids) metaanal = pubmed._get_flags_for_pattern(all_reuse_pmids, metaquery) oa_excerpts = oaexcerpt.get_oa_excerpts(all_reuse_pmcids, "(GSE.\d|GDS.\d|omnibus)", flags = re.IGNORECASE|re.MULTILINE) biolink_filter = pubmedcentral.get_flags_for_pattern(all_reuse_pmcids, '(geo OR omnibus) AND microarray AND "gene expression" AND accession NOT (databases OR user OR users OR (public AND accessed) OR (downloaded AND published))') basic_reuse_filter = pubmedcentral.get_flags_for_pattern(all_reuse_pmcids, '"gene expression omnibus" AND (submitted OR deposited)') creation_filter = pubmedcentral.get_flags_for_pattern(all_reuse_pmcids, '("gene expression" [text] AND "microarray" [text] AND "cell" [text] AND "rna" [text]) AND ("rneasy" [text] OR "trizol" [text] OR "real-time pcr" [text]) NOT ("tissue microarray*" [text] OR "cpg island*" [text])') has_mesh = {} for term in meshes: has_mesh[term] = pubmed.filter_pmids(all_reuse_pmids, term) mesh_filters = [";".join([term for term in has_mesh if pmid in has_mesh[term]]) for pmid in all_reuse_pmids] has_word = {} for word in words: has_word[word] = pubmedcentral.filter_pmcids(all_reuse_pmcids, word) word_filters = [";".join([word for word in has_word if pmcid in has_word[word]]) for pmcid in all_reuse_pmcids] print word_filters reuse_pmid_dict = defaultdict(tuple, zip(all_reuse_pmids, zip(reuse_affiliation, journal, year, date_published, medline_status, is_geo_reuse, reuse_is_oa, metaanal, mesh_filters))) reuse_pmcid_dict = defaultdict(tuple, zip(all_reuse_pmcids, zip(biolink_filter, basic_reuse_filter, creation_filter, oa_excerpts, word_filters))) full_dict = {} for vals in id_dict.values(): id = vals[0] reuse_pmcid = vals[4] reuse_pmid = vals[5][0] if vals[5] else "" full_dict[id+reuse_pmcid] = vals + ("|",) + reuse_pmcid_dict[reuse_pmcid] + ("|",) + reuse_pmid_dict[reuse_pmid] print full_dict[id+reuse_pmcid] return(full_dict)
def get_response_dict(id_type, ids, pmc_query): response_dict = {} num_ids = len(ids) id_counter = 0 geo_instance = geo.GEO() for accession in ids: print accession, id_counter += 1 reuse_pmcids = get_accession_in_pmc_fulltext(id_type, accession, pmc_query) if ((not reuse_pmcids) or ("<ERROR>" in "".join(reuse_pmcids))): print " Nope" continue stripped_accession = geo.get_stripped_accession(accession) if id_type=="GSE": gse_accessions = [stripped_accession] gds_accessions = [geo.get_stripped_accession(acc) for acc in geo.get_gds_from_gse("GSE"+stripped_accession)] else: gds_accessions = [stripped_accession] gse_accessions = [geo.get_stripped_accession(acc) for acc in geo.get_gse_from_gds("GDS"+stripped_accession)] try: submit_pmids = geo_instance.pmids("GSE"+gse_accessions[0]) except Exception: continue print id_counter, "of", num_ids, ":", stripped_accession, "--", (submit_pmids), "; ", len(reuse_pmcids) for reuse_pmcid in reuse_pmcids: reuse_pmids_for_pmcid = pubmedcentral.pmcids_to_pmids(reuse_pmcid) dict_key = (id_type+stripped_accession, reuse_pmcid) this_submit_contributors = flatten_unique([geo_instance.contributors("GSE"+gse_accession) for gse_accession in gse_accessions]) this_submit_authors = get_authors_and_submittors_from_accession(submit_pmids, this_submit_contributors) this_reuse_authors = get_authors_and_submittors_from_accession(reuse_pmids_for_pmcid) intersect = get_author_intersect_submit_reuse(this_submit_authors, this_reuse_authors) submit_affiliation = affiliation.institution(submit_pmids) release_date = geo_instance.release_date("GSE"+gse_accession) response_dict[dict_key] = (id_type+stripped_accession, gse_accessions, gds_accessions, submit_pmids, reuse_pmcid, reuse_pmids_for_pmcid, this_submit_authors, this_reuse_authors, intersect, submit_affiliation, release_date) #print response_dict[dict_key] return(response_dict)
def get_all_attributes(id_dict): all_accession_key = [vals[0] for vals in id_dict.values()] all_submit_pmids = flatten_unique([vals[3] for vals in id_dict.values()]) all_reuse_pmcids = flatten_unique([vals[4] for vals in id_dict.values()]) all_reuse_pmids = flatten_unique([vals[5] for vals in id_dict.values()]) reuse_affiliation = affiliation.institution(all_reuse_pmids) journal = pubmed.journal(all_reuse_pmids) year = pubmed.year_published(all_reuse_pmids) date_published = pubmed.date_published(all_reuse_pmids) medline_status = pubmed.medline_status(all_reuse_pmids) is_geo_reuse = geo_reuse.is_geo_reuse(all_reuse_pmids) reuse_is_oa = pubmed.is_open_access(all_reuse_pmids) metaanal = pubmed._get_flags_for_pattern(all_reuse_pmids, metaquery) oa_excerpts = oaexcerpt.get_oa_excerpts(all_reuse_pmcids, "(GSE.\d|GDS.\d|omnibus)", flags=re.IGNORECASE | re.MULTILINE) biolink_filter = pubmedcentral.get_flags_for_pattern( all_reuse_pmcids, '(geo OR omnibus) AND microarray AND "gene expression" AND accession NOT (databases OR user OR users OR (public AND accessed) OR (downloaded AND published))' ) basic_reuse_filter = pubmedcentral.get_flags_for_pattern( all_reuse_pmcids, '"gene expression omnibus" AND (submitted OR deposited)') creation_filter = pubmedcentral.get_flags_for_pattern( all_reuse_pmcids, '("gene expression" [text] AND "microarray" [text] AND "cell" [text] AND "rna" [text]) AND ("rneasy" [text] OR "trizol" [text] OR "real-time pcr" [text]) NOT ("tissue microarray*" [text] OR "cpg island*" [text])' ) has_mesh = {} for term in meshes: has_mesh[term] = pubmed.filter_pmids(all_reuse_pmids, term) mesh_filters = [ ";".join([term for term in has_mesh if pmid in has_mesh[term]]) for pmid in all_reuse_pmids ] has_word = {} for word in words: has_word[word] = pubmedcentral.filter_pmcids(all_reuse_pmcids, word) word_filters = [ ";".join([word for word in has_word if pmcid in has_word[word]]) for pmcid in all_reuse_pmcids ] print word_filters reuse_pmid_dict = defaultdict( tuple, zip( all_reuse_pmids, zip(reuse_affiliation, journal, year, date_published, medline_status, is_geo_reuse, reuse_is_oa, metaanal, mesh_filters))) reuse_pmcid_dict = defaultdict( tuple, zip( all_reuse_pmcids, zip(biolink_filter, basic_reuse_filter, creation_filter, oa_excerpts, word_filters))) full_dict = {} for vals in id_dict.values(): id = vals[0] reuse_pmcid = vals[4] reuse_pmid = vals[5][0] if vals[5] else "" full_dict[id + reuse_pmcid] = vals + ( "|", ) + reuse_pmcid_dict[reuse_pmcid] + ( "|", ) + reuse_pmid_dict[reuse_pmid] print full_dict[id + reuse_pmcid] return (full_dict)