def get_excerpts():
    """Write tagged open-access excerpts for each accession/PMCID pair.

    Reads ``scienceplot_new/results/reuse_pmcids.txt``; on every non-blank
    line the first tab-separated field is taken as the accession and the
    second as the PMCID.  All PMCIDs are passed to
    ``oaexcerpt.get_oa_excerpts`` along with the search pattern, two ``200``
    arguments (presumably the amount of context kept around a match --
    TODO confirm against ``oaexcerpt``), the regex flags and the article
    cache directory.

    One row per (accession, pmcid, excerpt) triple is written to
    ``scienceplot_new/results/excerpts.txt`` with four tab-separated
    columns: accession, PMCID, excerpt, and the excerpt in which every
    occurrence of the accession minus its first three characters is
    followed by the literal marker ``{{tag}}``.  Rows end with ``\\r\\n``.

    Returns:
        None.

    Raises:
        IndexError: if a non-blank input line has no tab-separated second
            field.
    """
    # NOTE(review): a second definition of get_excerpts follows this one in
    # the module and replaces it at import time.
    accessions = []
    reuse_pmcids = []
    with open("scienceplot_new/results/reuse_pmcids.txt") as pmcid_file:
        for line in pmcid_file:
            if not line.strip():
                # A stray blank line carries no accession/PMCID pair.
                continue
            fields = line.split("\t")
            accessions.append(fields[0].strip())
            reuse_pmcids.append(fields[1].strip())

    cached_article_dir = "scienceplot_new/articles"
    oa_excerpts = oaexcerpt.get_oa_excerpts(
        reuse_pmcids,
        r"(GSE.\d|GDS.\d|omnibus|download|publicly)",
        200,
        200,
        re.IGNORECASE | re.MULTILINE,
        cached_article_dir)

    with open("scienceplot_new/results/excerpts.txt", "w") as excerpts_file:
        for accession, pmcid, excerpt in zip(accessions, reuse_pmcids,
                                             oa_excerpts):
            lookup_number = accession[3:]  # remove the prefix
            if lookup_number:
                # Escape the number and use a callable replacement so it is
                # matched and re-emitted literally, never interpreted as
                # regex or replacement-template syntax.
                excerpt_tagged = re.sub(
                    re.escape(lookup_number),
                    lambda match: match.group(0) + "{{tag}}",
                    excerpt)
            else:
                # An empty pattern would match at every position and
                # scatter the marker through the whole excerpt.
                excerpt_tagged = excerpt
            row = [accession, pmcid, str(excerpt), excerpt_tagged]
            excerpts_file.write("\t".join(row) + "\r\n")
def get_excerpts():
    """Write tagged open-access excerpts for each accession/PMCID pair.

    Reads ``scienceplot_new/results/reuse_pmcids.txt``; on every non-blank
    line the first tab-separated field is taken as the accession and the
    second as the PMCID.  All PMCIDs are passed to
    ``oaexcerpt.get_oa_excerpts`` along with the search pattern, two ``200``
    arguments (presumably the amount of context kept around a match --
    TODO confirm against ``oaexcerpt``), the regex flags and the article
    cache directory.

    One row per (accession, pmcid, excerpt) triple is written to
    ``scienceplot_new/results/excerpts.txt`` with four tab-separated
    columns: accession, PMCID, excerpt, and the excerpt in which every
    occurrence of the accession minus its first three characters is
    followed by the literal marker ``{{tag}}``.  Rows end with ``\\r\\n``.

    Returns:
        None.

    Raises:
        IndexError: if a non-blank input line has no tab-separated second
            field.
    """
    # NOTE(review): an earlier definition of get_excerpts precedes this one
    # in the module; this later one is what callers actually get.
    accessions = []
    reuse_pmcids = []
    with open("scienceplot_new/results/reuse_pmcids.txt") as pmcid_file:
        for line in pmcid_file:
            if not line.strip():
                # A stray blank line carries no accession/PMCID pair.
                continue
            fields = line.split("\t")
            accessions.append(fields[0].strip())
            reuse_pmcids.append(fields[1].strip())

    cached_article_dir = "scienceplot_new/articles"
    oa_excerpts = oaexcerpt.get_oa_excerpts(
        reuse_pmcids,
        r"(GSE.\d|GDS.\d|omnibus|download|publicly)",
        200,
        200,
        re.IGNORECASE | re.MULTILINE,
        cached_article_dir)

    with open("scienceplot_new/results/excerpts.txt", "w") as excerpts_file:
        for accession, pmcid, excerpt in zip(accessions, reuse_pmcids,
                                             oa_excerpts):
            lookup_number = accession[3:]  # remove the prefix
            if lookup_number:
                # Escape the number and use a callable replacement so it is
                # matched and re-emitted literally, never interpreted as
                # regex or replacement-template syntax.
                excerpt_tagged = re.sub(
                    re.escape(lookup_number),
                    lambda match: match.group(0) + "{{tag}}",
                    excerpt)
            else:
                # An empty pattern would match at every position and
                # scatter the marker through the whole excerpt.
                excerpt_tagged = excerpt
            row = [accession, pmcid, str(excerpt), excerpt_tagged]
            excerpts_file.write("\t".join(row) + "\r\n")
def get_all_attributes(id_dict):
    """Join PMID- and PMCID-level attributes onto every record of *id_dict*.

    Args:
        id_dict: mapping whose values are tuples.  Index 0 is used as the
            record id, index 4 as the reuse PMCID (it is concatenated to
            the id and used as a lookup key) and index 5 as a sequence of
            reuse PMIDs, of which only the first element is looked up.

    Returns:
        dict keyed by ``id + reuse_pmcid``.  Each value is the original
        tuple, then ``"|"``, then the PMCID-level attributes (biolink
        filter, basic reuse filter, creation filter, excerpt, word
        filters), then ``"|"``, then the PMID-level attributes
        (affiliation, journal, year, date published, MEDLINE status, GEO
        reuse flag, open-access flag, meta-analysis flag, MeSH filters).
        An id missing from the lookup tables contributes an empty tuple,
        so such rows are shorter.
    """
    # NOTE(review): a second definition of get_all_attributes follows this
    # one in the module and replaces it at import time.
    records = list(id_dict.values())
    all_reuse_pmcids = flatten_unique([vals[4] for vals in records])
    all_reuse_pmids = flatten_unique([vals[5] for vals in records])

    # Attributes looked up per PMID.
    reuse_affiliation = affiliation.institution(all_reuse_pmids)
    journal = pubmed.journal(all_reuse_pmids)
    year = pubmed.year_published(all_reuse_pmids)
    date_published = pubmed.date_published(all_reuse_pmids)
    medline_status = pubmed.medline_status(all_reuse_pmids)
    is_geo_reuse = geo_reuse.is_geo_reuse(all_reuse_pmids)
    reuse_is_oa = pubmed.is_open_access(all_reuse_pmids)
    metaanal = pubmed._get_flags_for_pattern(all_reuse_pmids, metaquery)

    # Attributes looked up per PMCID.
    oa_excerpts = oaexcerpt.get_oa_excerpts(
        all_reuse_pmcids,
        r"(GSE.\d|GDS.\d|omnibus)",
        flags=re.IGNORECASE | re.MULTILINE)
    biolink_filter = pubmedcentral.get_flags_for_pattern(
        all_reuse_pmcids,
        '(geo OR omnibus) AND microarray AND "gene expression" AND accession '
        'NOT (databases OR user OR users OR (public AND accessed) '
        'OR (downloaded AND published))')
    basic_reuse_filter = pubmedcentral.get_flags_for_pattern(
        all_reuse_pmcids,
        '"gene expression omnibus" AND (submitted OR deposited)')
    creation_filter = pubmedcentral.get_flags_for_pattern(
        all_reuse_pmcids,
        '("gene expression" [text] AND "microarray" [text] '
        'AND "cell" [text] AND "rna" [text]) '
        'AND ("rneasy" [text] OR "trizol" [text] OR "real-time pcr" [text]) '
        'NOT ("tissue microarray*" [text] OR "cpg island*" [text])')

    # For each PMID, the ";"-joined MeSH terms whose filter result contains it.
    has_mesh = {}
    for term in meshes:
        has_mesh[term] = pubmed.filter_pmids(all_reuse_pmids, term)
    mesh_filters = [
        ";".join([term for term in has_mesh if pmid in has_mesh[term]])
        for pmid in all_reuse_pmids
    ]

    # Same idea per PMCID, for the full-text words.
    has_word = {}
    for word in words:
        has_word[word] = pubmedcentral.filter_pmcids(all_reuse_pmcids, word)
    word_filters = [
        ";".join([word for word in has_word if pmcid in has_word[word]])
        for pmcid in all_reuse_pmcids
    ]
    print(word_filters)

    # defaultdict(tuple) makes an unknown id contribute an empty tuple
    # instead of raising KeyError when the rows are assembled below.
    reuse_pmid_dict = defaultdict(
        tuple,
        zip(all_reuse_pmids,
            zip(reuse_affiliation, journal, year, date_published,
                medline_status, is_geo_reuse, reuse_is_oa, metaanal,
                mesh_filters)))
    reuse_pmcid_dict = defaultdict(
        tuple,
        zip(all_reuse_pmcids,
            zip(biolink_filter, basic_reuse_filter, creation_filter,
                oa_excerpts, word_filters)))

    full_dict = {}
    for vals in records:
        record_id = vals[0]
        reuse_pmcid = vals[4]
        # Only the first reuse PMID is used; "" stands in when there is none.
        reuse_pmid = vals[5][0] if vals[5] else ""
        key = record_id + reuse_pmcid
        full_dict[key] = (vals + ("|",) + reuse_pmcid_dict[reuse_pmcid] +
                          ("|",) + reuse_pmid_dict[reuse_pmid])
        print(full_dict[key])
    return full_dict
def get_all_attributes(id_dict):
    """Join PMID- and PMCID-level attributes onto every record of *id_dict*.

    Args:
        id_dict: mapping whose values are tuples.  Index 0 is used as the
            record id, index 4 as the reuse PMCID (it is concatenated to
            the id and used as a lookup key) and index 5 as a sequence of
            reuse PMIDs, of which only the first element is looked up.

    Returns:
        dict keyed by ``id + reuse_pmcid``.  Each value is the original
        tuple, then ``"|"``, then the PMCID-level attributes (biolink
        filter, basic reuse filter, creation filter, excerpt, word
        filters), then ``"|"``, then the PMID-level attributes
        (affiliation, journal, year, date published, MEDLINE status, GEO
        reuse flag, open-access flag, meta-analysis flag, MeSH filters).
        An id missing from the lookup tables contributes an empty tuple,
        so such rows are shorter.
    """
    # NOTE(review): an earlier definition of get_all_attributes precedes
    # this one in the module; this later one is what callers actually get.
    records = list(id_dict.values())
    all_reuse_pmcids = flatten_unique([vals[4] for vals in records])
    all_reuse_pmids = flatten_unique([vals[5] for vals in records])

    # Attributes looked up per PMID.
    reuse_affiliation = affiliation.institution(all_reuse_pmids)
    journal = pubmed.journal(all_reuse_pmids)
    year = pubmed.year_published(all_reuse_pmids)
    date_published = pubmed.date_published(all_reuse_pmids)
    medline_status = pubmed.medline_status(all_reuse_pmids)
    is_geo_reuse = geo_reuse.is_geo_reuse(all_reuse_pmids)
    reuse_is_oa = pubmed.is_open_access(all_reuse_pmids)
    metaanal = pubmed._get_flags_for_pattern(all_reuse_pmids, metaquery)

    # Attributes looked up per PMCID.
    oa_excerpts = oaexcerpt.get_oa_excerpts(
        all_reuse_pmcids,
        r"(GSE.\d|GDS.\d|omnibus)",
        flags=re.IGNORECASE | re.MULTILINE)
    biolink_filter = pubmedcentral.get_flags_for_pattern(
        all_reuse_pmcids,
        '(geo OR omnibus) AND microarray AND "gene expression" AND accession '
        'NOT (databases OR user OR users OR (public AND accessed) '
        'OR (downloaded AND published))')
    basic_reuse_filter = pubmedcentral.get_flags_for_pattern(
        all_reuse_pmcids,
        '"gene expression omnibus" AND (submitted OR deposited)')
    creation_filter = pubmedcentral.get_flags_for_pattern(
        all_reuse_pmcids,
        '("gene expression" [text] AND "microarray" [text] '
        'AND "cell" [text] AND "rna" [text]) '
        'AND ("rneasy" [text] OR "trizol" [text] OR "real-time pcr" [text]) '
        'NOT ("tissue microarray*" [text] OR "cpg island*" [text])')

    # For each PMID, the ";"-joined MeSH terms whose filter result contains it.
    has_mesh = {}
    for term in meshes:
        has_mesh[term] = pubmed.filter_pmids(all_reuse_pmids, term)
    mesh_filters = [
        ";".join([term for term in has_mesh if pmid in has_mesh[term]])
        for pmid in all_reuse_pmids
    ]

    # Same idea per PMCID, for the full-text words.
    has_word = {}
    for word in words:
        has_word[word] = pubmedcentral.filter_pmcids(all_reuse_pmcids, word)
    word_filters = [
        ";".join([word for word in has_word if pmcid in has_word[word]])
        for pmcid in all_reuse_pmcids
    ]
    print(word_filters)

    # defaultdict(tuple) makes an unknown id contribute an empty tuple
    # instead of raising KeyError when the rows are assembled below.
    reuse_pmid_dict = defaultdict(
        tuple,
        zip(all_reuse_pmids,
            zip(reuse_affiliation, journal, year, date_published,
                medline_status, is_geo_reuse, reuse_is_oa, metaanal,
                mesh_filters)))
    reuse_pmcid_dict = defaultdict(
        tuple,
        zip(all_reuse_pmcids,
            zip(biolink_filter, basic_reuse_filter, creation_filter,
                oa_excerpts, word_filters)))

    full_dict = {}
    for vals in records:
        record_id = vals[0]
        reuse_pmcid = vals[4]
        # Only the first reuse PMID is used; "" stands in when there is none.
        reuse_pmid = vals[5][0] if vals[5] else ""
        key = record_id + reuse_pmcid
        full_dict[key] = (vals + ("|",) + reuse_pmcid_dict[reuse_pmcid] +
                          ("|",) + reuse_pmid_dict[reuse_pmid])
        print(full_dict[key])
    return full_dict