Esempio n. 1
0
def get_excerpts():
    """Write a tab-separated file of article excerpts for each reuse PMCID.

    Reads ``scienceplot_new/results/reuse_pmcids.txt`` (tab-separated; the
    first column is used as the accession and the second as the PMCID),
    requests excerpts from ``oaexcerpt.get_oa_excerpts`` and writes
    ``scienceplot_new/results/excerpts.txt`` with four columns per row:
    accession, PMCID, excerpt, and the excerpt with ``{{tag}}`` appended
    after every occurrence of the accession's number.

    Whitespace-only input lines are skipped. A non-blank line without a tab
    still raises ``IndexError``.

    Returns:
        None. The result is the file written to disk.
    """
    with open("scienceplot_new/results/reuse_pmcids.txt") as pmcids_file:
        # Split each row once; blank rows (e.g. a trailing newline) carry no
        # data and would otherwise fail on the second-column lookup.
        rows = [line.split("\t") for line in pmcids_file if line.strip()]
    accessions = [row[0].strip() for row in rows]
    reuse_pmcids = [row[1].strip() for row in rows]

    cached_article_dir = "scienceplot_new/articles"
    # NOTE(review): the two 200s are passed positionally; presumably they are
    # context widths around each match -- confirm against oaexcerpt.
    oa_excerpts = oaexcerpt.get_oa_excerpts(
        reuse_pmcids,
        r"(GSE.\d|GDS.\d|omnibus|download|publicly)",
        200,
        200,
        re.IGNORECASE | re.MULTILINE,
        cached_article_dir,
    )

    # The excerpts are paired with the input rows by position, so they are
    # presumably returned in the same order as reuse_pmcids -- TODO confirm.
    with open("scienceplot_new/results/excerpts.txt", "w") as excerpts_file:
        for accession, pmcid, excerpt in zip(accessions, reuse_pmcids,
                                             oa_excerpts):
            lookup_number = accession[3:]  # drop the 3-character prefix
            # Escape the number and use a callable replacement so that the
            # accession text is always treated literally, never as regex or
            # as a backslash template.
            excerpt_tagged = re.sub(
                re.escape(lookup_number),
                lambda match: match.group(0) + "{{tag}}",
                excerpt,
            )
            excerpts_file.write(
                "\t".join([accession, pmcid, str(excerpt), excerpt_tagged])
                + "\r\n")
Esempio n. 2
0
def get_excerpts():
    """Write a tab-separated file of article excerpts for each reuse PMCID.

    Reads ``scienceplot_new/results/reuse_pmcids.txt`` (tab-separated; the
    first column is used as the accession and the second as the PMCID),
    requests excerpts from ``oaexcerpt.get_oa_excerpts`` and writes
    ``scienceplot_new/results/excerpts.txt`` with four columns per row:
    accession, PMCID, excerpt, and the excerpt with ``{{tag}}`` appended
    after every occurrence of the accession's number.

    Whitespace-only input lines are skipped. A non-blank line without a tab
    still raises ``IndexError``.

    Returns:
        None. The result is the file written to disk.
    """
    with open("scienceplot_new/results/reuse_pmcids.txt") as pmcids_file:
        # Split each row once; blank rows (e.g. a trailing newline) carry no
        # data and would otherwise fail on the second-column lookup.
        rows = [line.split("\t") for line in pmcids_file if line.strip()]
    accessions = [row[0].strip() for row in rows]
    reuse_pmcids = [row[1].strip() for row in rows]

    cached_article_dir = "scienceplot_new/articles"
    # NOTE(review): the two 200s are passed positionally; presumably they are
    # context widths around each match -- confirm against oaexcerpt.
    oa_excerpts = oaexcerpt.get_oa_excerpts(
        reuse_pmcids,
        r"(GSE.\d|GDS.\d|omnibus|download|publicly)",
        200,
        200,
        re.IGNORECASE | re.MULTILINE,
        cached_article_dir,
    )

    # The excerpts are paired with the input rows by position, so they are
    # presumably returned in the same order as reuse_pmcids -- TODO confirm.
    with open("scienceplot_new/results/excerpts.txt", "w") as excerpts_file:
        for accession, pmcid, excerpt in zip(accessions, reuse_pmcids,
                                             oa_excerpts):
            lookup_number = accession[3:]  # drop the 3-character prefix
            # Escape the number and use a callable replacement so that the
            # accession text is always treated literally, never as regex or
            # as a backslash template.
            excerpt_tagged = re.sub(
                re.escape(lookup_number),
                lambda match: match.group(0) + "{{tag}}",
                excerpt,
            )
            excerpts_file.write(
                "\t".join([accession, pmcid, str(excerpt), excerpt_tagged])
                + "\r\n")
Esempio n. 3
0
def get_all_attributes(id_dict):
    """Join per-article attributes onto every record of ``id_dict``.

    Args:
        id_dict: mapping whose values are tuples. Index 0 is used as the
            accession key, index 4 as the reuse PMCID, and index 5 as a
            sequence of reuse PMIDs (only its first element is used for the
            attribute lookup). NOTE(review): the meaning of the remaining
            positions is not visible from this function.

    Returns:
        dict keyed by accession key + reuse PMCID. Each value is the
        original tuple, a "|" marker, the PMCID-based attributes (biolink
        filter, basic reuse filter, creation filter, OA excerpt, word
        filters), another "|" marker, and the PMID-based attributes
        (affiliation, journal, year, date published, MEDLINE status,
        GEO-reuse flag, open-access flag, meta-analysis flag, MeSH filters).
        An attribute group is an empty tuple when its identifier was not
        among the looked-up ids.
    """
    records = list(id_dict.values())
    all_reuse_pmcids = flatten_unique([vals[4] for vals in records])
    all_reuse_pmids = flatten_unique([vals[5] for vals in records])

    # PMID-based lookups. Every result is zipped against all_reuse_pmids
    # below, so each is presumably a sequence parallel to it -- TODO confirm.
    reuse_affiliation = affiliation.institution(all_reuse_pmids)
    journal = pubmed.journal(all_reuse_pmids)
    year = pubmed.year_published(all_reuse_pmids)
    date_published = pubmed.date_published(all_reuse_pmids)
    medline_status = pubmed.medline_status(all_reuse_pmids)
    is_geo_reuse = geo_reuse.is_geo_reuse(all_reuse_pmids)
    reuse_is_oa = pubmed.is_open_access(all_reuse_pmids)
    metaanal = pubmed._get_flags_for_pattern(all_reuse_pmids, metaquery)

    # PMCID-based lookups, zipped against all_reuse_pmcids below.
    oa_excerpts = oaexcerpt.get_oa_excerpts(
        all_reuse_pmcids,
        r"(GSE.\d|GDS.\d|omnibus)",
        flags=re.IGNORECASE | re.MULTILINE)

    biolink_filter = pubmedcentral.get_flags_for_pattern(
        all_reuse_pmcids,
        '(geo OR omnibus)  AND microarray  AND "gene expression" AND accession NOT (databases OR user OR users  OR (public AND accessed) OR (downloaded AND published))')
    basic_reuse_filter = pubmedcentral.get_flags_for_pattern(
        all_reuse_pmcids,
        '"gene expression omnibus" AND (submitted OR deposited)')
    creation_filter = pubmedcentral.get_flags_for_pattern(
        all_reuse_pmcids,
        '("gene expression" [text] AND "microarray" [text] AND "cell" [text] AND "rna" [text]) AND ("rneasy" [text] OR "trizol" [text] OR "real-time pcr" [text]) NOT ("tissue microarray*" [text] OR "cpg island*" [text])')

    # For every PMID, the ";"-joined MeSH terms whose filtered result
    # contains that PMID.
    has_mesh = {}
    for term in meshes:
        has_mesh[term] = pubmed.filter_pmids(all_reuse_pmids, term)
    mesh_filters = [
        ";".join([term for term in has_mesh if pmid in has_mesh[term]])
        for pmid in all_reuse_pmids
    ]

    # Same idea for full-text words, keyed by PMCID.
    has_word = {}
    for word in words:
        has_word[word] = pubmedcentral.filter_pmcids(all_reuse_pmcids, word)
    word_filters = [
        ";".join([word for word in has_word if pmcid in has_word[word]])
        for pmcid in all_reuse_pmcids
    ]
    print(word_filters)  # progress/debug output kept from the original

    # defaultdict(tuple) makes unknown ids contribute an empty tuple, so the
    # concatenation below never raises KeyError.
    pmid_attributes = zip(reuse_affiliation, journal, year, date_published,
                          medline_status, is_geo_reuse, reuse_is_oa, metaanal,
                          mesh_filters)
    reuse_pmid_dict = defaultdict(tuple, zip(all_reuse_pmids, pmid_attributes))
    pmcid_attributes = zip(biolink_filter, basic_reuse_filter, creation_filter,
                           oa_excerpts, word_filters)
    reuse_pmcid_dict = defaultdict(tuple,
                                   zip(all_reuse_pmcids, pmcid_attributes))

    full_dict = {}
    for vals in records:
        accession_key = vals[0]
        reuse_pmcid = vals[4]
        # Only the first reuse PMID is used; "" is the "no PMID" key.
        reuse_pmid = vals[5][0] if vals[5] else ""
        combined_key = accession_key + reuse_pmcid
        full_dict[combined_key] = (vals
                                   + ("|",) + reuse_pmcid_dict[reuse_pmcid]
                                   + ("|",) + reuse_pmid_dict[reuse_pmid])
        print(full_dict[combined_key])
    return full_dict
Esempio n. 4
0
def get_all_attributes(id_dict):
    """Join per-article attributes onto every record of ``id_dict``.

    Args:
        id_dict: mapping whose values are tuples. Index 0 is used as the
            accession key, index 4 as the reuse PMCID, and index 5 as a
            sequence of reuse PMIDs (only its first element is used for the
            attribute lookup). NOTE(review): the meaning of the remaining
            positions is not visible from this function.

    Returns:
        dict keyed by accession key + reuse PMCID. Each value is the
        original tuple, a "|" marker, the PMCID-based attributes (biolink
        filter, basic reuse filter, creation filter, OA excerpt, word
        filters), another "|" marker, and the PMID-based attributes
        (affiliation, journal, year, date published, MEDLINE status,
        GEO-reuse flag, open-access flag, meta-analysis flag, MeSH filters).
        An attribute group is an empty tuple when its identifier was not
        among the looked-up ids.
    """
    records = list(id_dict.values())
    all_reuse_pmcids = flatten_unique([vals[4] for vals in records])
    all_reuse_pmids = flatten_unique([vals[5] for vals in records])

    # PMID-based lookups. Every result is zipped against all_reuse_pmids
    # below, so each is presumably a sequence parallel to it -- TODO confirm.
    reuse_affiliation = affiliation.institution(all_reuse_pmids)
    journal = pubmed.journal(all_reuse_pmids)
    year = pubmed.year_published(all_reuse_pmids)
    date_published = pubmed.date_published(all_reuse_pmids)
    medline_status = pubmed.medline_status(all_reuse_pmids)
    is_geo_reuse = geo_reuse.is_geo_reuse(all_reuse_pmids)
    reuse_is_oa = pubmed.is_open_access(all_reuse_pmids)
    metaanal = pubmed._get_flags_for_pattern(all_reuse_pmids, metaquery)

    # PMCID-based lookups, zipped against all_reuse_pmcids below.
    oa_excerpts = oaexcerpt.get_oa_excerpts(
        all_reuse_pmcids,
        r"(GSE.\d|GDS.\d|omnibus)",
        flags=re.IGNORECASE | re.MULTILINE)

    biolink_filter = pubmedcentral.get_flags_for_pattern(
        all_reuse_pmcids,
        '(geo OR omnibus)  AND microarray  AND "gene expression" AND accession NOT (databases OR user OR users  OR (public AND accessed) OR (downloaded AND published))')
    basic_reuse_filter = pubmedcentral.get_flags_for_pattern(
        all_reuse_pmcids,
        '"gene expression omnibus" AND (submitted OR deposited)')
    creation_filter = pubmedcentral.get_flags_for_pattern(
        all_reuse_pmcids,
        '("gene expression" [text] AND "microarray" [text] AND "cell" [text] AND "rna" [text]) AND ("rneasy" [text] OR "trizol" [text] OR "real-time pcr" [text]) NOT ("tissue microarray*" [text] OR "cpg island*" [text])')

    # For every PMID, the ";"-joined MeSH terms whose filtered result
    # contains that PMID.
    has_mesh = {}
    for term in meshes:
        has_mesh[term] = pubmed.filter_pmids(all_reuse_pmids, term)
    mesh_filters = [
        ";".join([term for term in has_mesh if pmid in has_mesh[term]])
        for pmid in all_reuse_pmids
    ]

    # Same idea for full-text words, keyed by PMCID.
    has_word = {}
    for word in words:
        has_word[word] = pubmedcentral.filter_pmcids(all_reuse_pmcids, word)
    word_filters = [
        ";".join([word for word in has_word if pmcid in has_word[word]])
        for pmcid in all_reuse_pmcids
    ]
    print(word_filters)  # progress/debug output kept from the original

    # defaultdict(tuple) makes unknown ids contribute an empty tuple, so the
    # concatenation below never raises KeyError.
    pmid_attributes = zip(reuse_affiliation, journal, year, date_published,
                          medline_status, is_geo_reuse, reuse_is_oa, metaanal,
                          mesh_filters)
    reuse_pmid_dict = defaultdict(tuple, zip(all_reuse_pmids, pmid_attributes))
    pmcid_attributes = zip(biolink_filter, basic_reuse_filter, creation_filter,
                           oa_excerpts, word_filters)
    reuse_pmcid_dict = defaultdict(tuple,
                                   zip(all_reuse_pmcids, pmcid_attributes))

    full_dict = {}
    for vals in records:
        accession_key = vals[0]
        reuse_pmcid = vals[4]
        # Only the first reuse PMID is used; "" is the "no PMID" key.
        reuse_pmid = vals[5][0] if vals[5] else ""
        combined_key = accession_key + reuse_pmcid
        full_dict[combined_key] = (vals
                                   + ("|",) + reuse_pmcid_dict[reuse_pmcid]
                                   + ("|",) + reuse_pmid_dict[reuse_pmid])
        print(full_dict[combined_key])
    return full_dict