コード例 #1
0
def main():
    """Fetch PubMed documents for one hard-coded query and extract genes.

    ``search_type`` (set below) selects the query:

    * ``'1'`` -- university + author. The fetched documents are handed to
      ``get_genes.main``.
    * ``'2'`` -- university + gene. The documents are fetched but not
      processed any further (the original author marked this mode as
      "probably not going to use").

    Raises:
        ValueError: if ``search_type`` is neither 1 nor 2.
    """
    search_type = '1'
    articles = 1000
    university = 'University of Pittsburgh'
    gene = 'trpml1'
    Author_first = 'Kirill'
    Author_last = 'Kiselyov'

    # Pre-encoded query term: %2C%20 is ", " and %5B/%5D are "[" / "]",
    # i.e. "<last>, <first>[Author]".
    Author_string = Author_last + '%2C%20' + Author_first + '%5BAuthor%5D'
    university = get_university(university)

    mode = int(search_type)
    if mode == 1:
        # Per the original note, docs_dict looks like
        # {PMID: ['ABSTRACT_TEXT', [('FIRST', 'LAST', 'Affiliation'), ...]]}
        docs_dict = get_info_from_PubMed(articles, university, Author_string)
        # Gene extraction only happens here: this is the only mode that
        # produces docs_dict. Previously the call sat after the if/elif and
        # raised NameError whenever mode 1 was not selected.
        get_genes.main(docs_dict)
    elif mode == 2:
        gene_field = '{0}%20AND%20'.format(gene)
        # NOTE(review): the result of this query is not consumed anywhere yet,
        # and this call passes five arguments where mode 1 passes three --
        # confirm against get_info_from_PubMed before relying on this mode.
        get_info_from_PubMed(articles, university, gene_field,
                             Author_first, Author_last)
    else:
        raise ValueError(
            'unknown search_type: {0!r} (expected 1 or 2)'.format(search_type))
コード例 #2
0
ファイル: pubmed_xml.py プロジェクト: rothadamg/UPSITE
def main():
    """Run the hard-coded PubMed author search and record the results.

    ``search_type`` (set below) selects the mode:

    * ``'1'`` -- one author at one university: fetch the documents, print
      them and pass them to ``get_genes.main``.
    * ``'2'`` -- university + gene. Never wired up; raises
      ``NotImplementedError``.
    * ``'3'`` -- every author returned by ``get_name_list(file_location)``:
      for each one, fetch the documents, run ``get_genes.main``, append one
      row to a CSV file and append the pickled tf-idf lists to
      ``<csv path>_tfidf``; finally print the author -> frequency-list dict.

    Raises:
        NotImplementedError: if ``search_type`` is 2.
        ValueError: if ``search_type`` is not 1, 2 or 3.
    """
    search_type = '3'
    mode = int(search_type)

    if mode == 1:
        articles = 10
        Author_first = 'Catalina'
        Author_last = 'Cleves Bayon'
        # NOTE(review): the space inside Author_last goes out unescaped while
        # the separators around it are already percent-encoded -- confirm
        # that get_info_from_PubMed copes with that.
        Author_string = Author_last + '%2C%20' + Author_first + '%5BAuthor%5D'
        Author = Author_first + ' ' + Author_last
        university = get_university('University of Pittsburgh')
        docs_dict = get_info_from_PubMed(articles, university, Author_string)
        # Single-argument print(...) gives the same output as the Python 2
        # print statement and also parses on Python 3.
        print('docs dict %s' % (docs_dict,))
        # Only one author in this mode, hence "author 1 of 1".
        entity_frequency_list, tfidf_lists = get_genes.main(
            docs_dict, Author, 1, 1)

    elif mode == 2:
        # This branch used to read articles, university, Author_first and
        # Author_last, none of which is assigned on this path, so it could
        # only die with a NameError. The intended call was
        # get_info_from_PubMed(articles, university, '<gene>%20AND%20',
        #                      Author_first, Author_last).
        raise NotImplementedError(
            'search_type 2 (university+gene) is not implemented')

    elif mode == 3:
        articles = 40
        # School of Medicine faculty list. Alternatives kept by the author in
        # the same directory: 'Author_list' (Madhavi suggested example) and
        # 'SOM_Faculty_short.tsv' (cut short for testing).
        file_location = '/home/adam/workspace/TEES/text_files/Author_Lists/SOM_Faculty.tsv'
        university = get_university('University of Pittsburgh')
        Author_strings, Authors = get_name_list(file_location)

        # "<field 0> <field 1>" of each entry is the display name / dict key.
        Author_keys = [entry[0] + ' ' + entry[1] for entry in Authors]
        print('author keys %s' % (Author_keys,))

        # None of these change between authors, so compute them once.
        tot_authors = len(Author_strings)
        base_name = os.path.basename(file_location)
        output_file = '/home/adam/workspace/TEES/text_files/Author_Lists/output/{0}'.format(base_name)
        tfidf_output_file = output_file + '_tfidf'

        frequency_values = []
        for num, author_string in enumerate(Author_strings):
            author_name = Author_keys[num]
            docs_dict = get_info_from_PubMed(articles, university,
                                             author_string)
            # get_genes.main is given the 1-based position of this author.
            entity_frequency_list, tfidf_lists = get_genes.main(
                docs_dict, author_name, num + 1, tot_authors)
            frequency_values.append(entity_frequency_list)

            # Both files are opened in append mode inside the loop, so the
            # authors handled before a failure are already on disk; reruns
            # add to the existing files rather than replacing them.
            with open(output_file, 'a') as f:
                csv.writer(f).writerow((author_name, entity_frequency_list))
            # Binary mode: pickle output is a byte stream.
            with open(tfidf_output_file, 'ab') as f2:
                pickle.dump(tfidf_lists, f2)

        Author_frequency_dict = dict(zip(Author_keys, frequency_values))
        # Example from an earlier run:
        # {'Kirill Kiselyov': [('TRP', 0.148148), ('TRPC', 0.138889), ...],
        #  'Madhavi Ganapathiraju': [('ANKLE1', 0.25), ('ORAOV1', 0.25), ...]}
        print(Author_frequency_dict)

    else:
        raise ValueError(
            'unknown search_type: {0!r} (expected 1, 2 or 3)'.format(
                search_type))
コード例 #3
0
def main():
    """Run the hard-coded PubMed author search and record the results.

    ``search_type`` (set below) selects the mode:

    * ``'1'`` -- one author at one university: fetch the documents, print
      them and pass them to ``get_genes.main``.
    * ``'2'`` -- university + gene. Never wired up; raises
      ``NotImplementedError``.
    * ``'3'`` -- every author returned by ``get_name_list(file_location)``:
      for each one, fetch the documents, run ``get_genes.main``, append one
      row to a CSV file and append the pickled tf-idf lists to
      ``<csv path>_tfidf``; finally print the author -> frequency-list dict.

    Raises:
        NotImplementedError: if ``search_type`` is 2.
        ValueError: if ``search_type`` is not 1, 2 or 3.
    """
    search_type = '3'
    mode = int(search_type)

    if mode == 1:
        articles = 10
        Author_first = 'Catalina'
        Author_last = 'Cleves Bayon'
        # NOTE(review): the space inside Author_last goes out unescaped while
        # the separators around it are already percent-encoded -- confirm
        # that get_info_from_PubMed copes with that.
        Author_string = Author_last + '%2C%20' + Author_first + '%5BAuthor%5D'
        Author = Author_first + ' ' + Author_last
        university = get_university('University of Pittsburgh')
        docs_dict = get_info_from_PubMed(articles, university, Author_string)
        # Single-argument print(...) gives the same output as the Python 2
        # print statement and also parses on Python 3.
        print('docs dict %s' % (docs_dict,))
        # Only one author in this mode, hence "author 1 of 1".
        entity_frequency_list, tfidf_lists = get_genes.main(
            docs_dict, Author, 1, 1)

    elif mode == 2:
        # This branch used to read articles, university, Author_first and
        # Author_last, none of which is assigned on this path, so it could
        # only die with a NameError. The intended call was
        # get_info_from_PubMed(articles, university, '<gene>%20AND%20',
        #                      Author_first, Author_last).
        raise NotImplementedError(
            'search_type 2 (university+gene) is not implemented')

    elif mode == 3:
        articles = 40
        # School of Medicine faculty list. Alternatives kept by the author in
        # the same directory: 'Author_list' (Madhavi suggested example) and
        # 'SOM_Faculty_short.tsv' (cut short for testing).
        file_location = '/home/adam/workspace/TEES/text_files/Author_Lists/SOM_Faculty.tsv'
        university = get_university('University of Pittsburgh')
        Author_strings, Authors = get_name_list(file_location)

        # "<field 0> <field 1>" of each entry is the display name / dict key.
        Author_keys = [entry[0] + ' ' + entry[1] for entry in Authors]
        print('author keys %s' % (Author_keys,))

        # None of these change between authors, so compute them once.
        tot_authors = len(Author_strings)
        base_name = os.path.basename(file_location)
        output_file = '/home/adam/workspace/TEES/text_files/Author_Lists/output/{0}'.format(
            base_name)
        tfidf_output_file = output_file + '_tfidf'

        frequency_values = []
        for num, author_string in enumerate(Author_strings):
            author_name = Author_keys[num]
            docs_dict = get_info_from_PubMed(articles, university,
                                             author_string)
            # get_genes.main is given the 1-based position of this author.
            entity_frequency_list, tfidf_lists = get_genes.main(
                docs_dict, author_name, num + 1, tot_authors)
            frequency_values.append(entity_frequency_list)

            # Both files are opened in append mode inside the loop, so the
            # authors handled before a failure are already on disk; reruns
            # add to the existing files rather than replacing them.
            with open(output_file, 'a') as f:
                csv.writer(f).writerow((author_name, entity_frequency_list))
            # Binary mode: pickle output is a byte stream.
            with open(tfidf_output_file, 'ab') as f2:
                pickle.dump(tfidf_lists, f2)

        Author_frequency_dict = dict(zip(Author_keys, frequency_values))
        # Example from an earlier run:
        # {'Kirill Kiselyov': [('TRP', 0.148148), ('TRPC', 0.138889), ...],
        #  'Madhavi Ganapathiraju': [('ANKLE1', 0.25), ('ORAOV1', 0.25), ...]}
        print(Author_frequency_dict)

    else:
        raise ValueError(
            'unknown search_type: {0!r} (expected 1, 2 or 3)'.format(
                search_type))