def main():
    """Reload publication metadata from Pubmed into ent_experiment.

    For every experiment row with a non-NULL pub_pubmed id, look the
    publication up in Pubmed and update the pub_* columns in place.
    """
    conf = Config()

    logging.basicConfig(format="%(asctime)s %(name)s %(levelname) -5s : %(message)s")
    log = logging.getLogger("reload_pubmed")
    log.setLevel(logging.DEBUG)

    pubmed = Pubmed()

    log.info("Connecting ...")
    conn = biomart_db_connect(conf["biomart.db"], log)
    cursor = conn.cursor()
    update_cursor = conn.cursor()

    log.info("Querying experiments ...")
    cursor.execute("""
        select id, pub_pubmed, study_id, platf_id
        from ent_experiment where pub_pubmed is not NULL""")

    SPACES = re.compile(r"\s+")  # raw string: \s is a regex class, not a string escape

    # iter(callable, sentinel) advances the cursor on every pass, so a
    # `continue` below can no longer skip the fetch and loop forever
    # (the original fetched the next row only at the bottom of the loop).
    for row in iter(cursor.fetchone, None):
        exp_id, pmid, study_id, platf_id = row
        log.info(">>> PMID: {}, STUDY: {}, PLATFORM: {}".format(pmid, study_id, platf_id))

        pub = pubmed.find(pmid)
        if pub is None:
            log.error("PMID not found: {}".format(pmid))
            continue

        pub = pub[0]

        # Parameterized update: the driver escapes the values itself, so the
        # old manual quote-escaping (which missed backslashes) is not needed.
        sql = u"""
            update ent_experiment
            set pub_title=%s, pub_authors=%s, pub_year=%s, pub_journal=%s
            where id=%s"""
        params = (pub["title"], pub["short_authors"], pub["date"], pub["journal"], exp_id)
        log.debug("{} {}".format(SPACES.sub(" ", sql.strip()), params))
        update_cursor.execute(sql, params)

    cursor.close()
    update_cursor.close()
    conn.close()
def process(self):
    """Run a Pubmed search for the request's query parameters and return
    the matching articles serialized as JSON."""
    query_string = urlparse.urlparse(self.path).query
    params = urlparse.parse_qs(query_string)

    client = Pubmed("*****@*****.**")
    # NOTE(review): parse_qs yields lists of values, so params["query"] is a
    # list — Pubmed.query is assumed to accept it as-is; confirm upstream.
    client.query(params["query"])
    client.fetch()
    client.parse()

    return json.dumps(convert_articles(client.articles))
def get_pubmed_words(self, pubmed_id):
    '''
    return a dict in the same format as get_field_words:
    k=field, v=sanitized list of words
    '''
    pubmed = Pubmed(pubmed_id).populate()
    words = {}
    for tag in Pubmed.text_tags:
        # Best-effort: a populated record may lack some text tags entirely;
        # only AttributeError is swallowed, anything else propagates.
        try:
            value = getattr(pubmed, tag)
        except AttributeError:
            continue
        words[tag] = value
    return words
def insert(self, pmid, user):
    """Fetch the Medline record for *pmid* and insert it into the database
    (journal, reference, abstract, authors, ref_type); return reference_no.

    NOTE(review): `self` is called as a constructor and `self.query` is used,
    so this presumably runs with `self` bound to the model class (i.e. a
    classmethod-style call) — confirm against the enclosing class.
    """
    medline = FetchMedline([pmid])
    records = medline.get_records()
    ## it is weird you can't do record = records[0]??
    # `records` apparently does not support indexing (an iterator/generator);
    # this loop keeps the last — normally the only — record.
    for rec in records:
        record = rec
    # get pubmed instance
    pubmed = Pubmed(record)
    # insert journal
    journal_no = Journal.insert(pubmed.journal_abbrev, user)
    # insert reference — reuse the existing row for this pmid if one exists
    ref_no = 0
    ref_query = self.query.filter_by(pubmed=pmid)
    if ref_query.first():
        ref_no = ref_query.first().reference_no
    else:
        ref_entry = self(user, pubmed.publish_status, pubmed.citation, pubmed.year, pmid, 'PubMed script', pubmed.pdf_status, pubmed.pages, pubmed.volume, pubmed.title, pubmed.issue, journal_no)
        db.session.add(ref_entry)
        db.session.commit()
        ref_no = ref_entry.reference_no
    # insert abstract
    Abstract.insert(ref_no, pubmed.abstract_txt)
    # insert author (author order is 1-based and follows the record's order)
    order = 0
    for name in pubmed.authors:
        order += 1
        author_no = Author.insert(name, user)
        AuthorEditor.insert(author_no, ref_no, order)
    # insert ref_type
    RefType.insert(pubmed.pub_type, ref_no, 'NCBI', user)
    return ref_no
# First download nltk stuffs home=os.environ["HOME"] if not os.path.exists("%s/nltk_data" %home): import nltk nltk.download('all') # Download neurosynth data df = pandas.read_csv("database.txt",sep="\t") pmids = df.id.unique().tolist() print "NeuroSynth database has %s unique PMIDs" %(len(pmids)) # download abstract text email = "*****@*****.**" pm = Pubmed(email,pmc=False) articles1 = pm.get_many_articles(pmids[:10000]) articles2 = pm.get_many_articles(pmids[10000:]) articles = articles1.copy() articles.update(articles2) if not os.path.exists("articles.pkl"): pickle.dump(articles,open("articles.pkl","wb")) # Write articles to file #88390|"<text><p>sentence1</p><p>sentence2</p><p></text>" #88390|"<text><p>sentence1</p><p>sentence2</p><p></text>" # We should use utf-8 http://www.postgresql.org/docs/9.0/static/multibyte.html filey = open(output_file,"wb") count = 0
def main():
    """Populate the biomart ent_experiment table.

    Reads (study_id, platform_id) pairs from the "experiment" port, resolves
    the study and platform entities, gathers publication metadata (from
    Pubmed, or from study annotations as a fallback) and batch-inserts one
    row per experiment.
    """
    task.check_conf(["entities", "repositories", "biomart.db"])
    conf = task.conf

    insert_size = conf.get("biomart.insert_size", DEFAULT_INSERT_SIZE, dtype=int)

    # Optional mapping from study-id prefix to source metadata
    # (name, home_url, link template).
    if "biomart.study_source" in conf:
        study_source_map = conf["biomart.study_source"]
    else:
        study_source_map = conf.create_element()

    log = task.logger()

    exp_port = task.ports("experiment")

    es = EntityServer(conf["entities"])
    em = es.manager()

    conn = biomart_db_connect(conf["biomart.db"], log)

    db_engine = conf.get("biomart.db.engine", DEFAULT_DB_ENGINE)

    cursor = conn.cursor()

    cursor.execute("""
        CREATE TABLE ent_experiment (
          id int(11) NOT NULL,
          exp_name varchar(64) NOT NULL,
          study_id varchar(32) NOT NULL,
          study_source varchar(32) DEFAULT NULL,
          study_source_url varchar(512) DEFAULT NULL,
          study_link varchar(512) DEFAULT NULL,
          pub_pubmed varchar(32) DEFAULT NULL,
          pub_title varchar(300) DEFAULT NULL,
          pub_authors varchar(300) DEFAULT NULL,
          pub_year varchar(16) DEFAULT NULL,
          pub_journal varchar(200) DEFAULT NULL,
          platf_id varchar(32) NOT NULL,
          platf_title varchar(250) DEFAULT NULL,
          platf_technology varchar(96) DEFAULT NULL,
          PRIMARY KEY (id),
          KEY exp_name (exp_name),
          KEY pub_pubmed (pub_pubmed),
          KEY pub_title (pub_title),
          KEY pub_authors (pub_authors),
          KEY pub_year (pub_year),
          KEY pub_journal (pub_journal),
          KEY platf_title (platf_title),
          KEY platf_technology (platf_technology)
        ) ENGINE={} CHARACTER SET utf8 COLLATE utf8_general_ci""".format(db_engine))

    ib = BatchInsert(cursor, "ent_experiment",
            ["id", "exp_name", "study_id", "study_source", "study_source_url", "study_link",
             "pub_title", "pub_authors", "pub_year", "pub_pubmed", "pub_journal",
             "platf_id", "platf_title", "platf_technology"], insert_size)

    pubmed = Pubmed()

    for i, exp in enumerate(exp_port, 1):
        study_id = exp[0]
        platform_id = exp[1]

        study = em.find(study_id, types.SOURCE_STUDY)
        if study is None:
            log.error("{} not found: {}".format(types.SOURCE_STUDY, study_id))
            continue

        platf = em.find(platform_id, types.SOURCE_PLATFORM)
        if platf is None:
            log.error("{} not found: {}".format(types.SOURCE_PLATFORM, platform_id))
            continue

        log.info("Experiment for study {} and platform {} ...".format(study_id, platform_id))

        # Default publication fields; kept as None when nothing can be found.
        pub = {}
        for k in ["title", "short_authors", "date", "journal"]:
            pub[k] = None

        if "pubmed" in study:
            pmid = study["pubmed"]
            if isinstance(pmid, (DataElementList, list)):
                pmid = pmid[0]
                log.warn("Study {} with many pubmed_id's, only the first {} will be considered".format(study_id, pmid))

            log.debug("Retrieving information for pubmed_id '{}' ...".format(pmid))

            try:
                # Assign the lookup to a temporary so a miss keeps the
                # None-filled defaults in `pub` (the original overwrote `pub`
                # with an empty list, breaking pub.items() below).
                results = pubmed.find(pmid)
                if len(results) == 0:
                    log.error("No publication information found for pubmed_id '{}' in experiment ({}, {})".format(pmid, study_id, platform_id))
                else:
                    pub = results[0]
            except Exception as ex:
                log.error("Error retrieving pubmed information for experiment ({}, {}) with pubmed_id '{}'".format(study_id, platform_id, pmid))
                log.exception(ex)
        else:
            pmid = None
            log.warn("Study {} has no 'pubmed_id' annotation".format(study_id))

            if "title" not in study:
                log.error("Study {} doesn't have annotation for 'pubmed_id' nor 'title'".format(study_id))
            elif "SO/contact_details[0]/contact_name" not in study \
                    and "SO/contact_details/contact_name" not in study:
                log.error("Study {} doesn't have annotation for 'pubmed_id' nor 'SO.contact_details[0].contact_name'".format(study_id))
            else:
                # Fall back to the study's own annotations.
                try:
                    pub["title"] = study["title"]
                    if "SO/contact_details[0]/contact_name" in study:
                        pub["short_authors"] = study["SO/contact_details[0]/contact_name"]
                    else:
                        pub["short_authors"] = study["SO/contact_details/contact_name"]
                    if "SO/submission/pub_date" in study:
                        pub["date"] = study["SO/submission/pub_date"]
                    else:
                        pub["date"] = ""
                except Exception as ex:
                    log.debug(study)
                    log.exception(ex)  # fixed: was log.execption (AttributeError)

        # Escape single quotes for the SQL insert performed by BatchInsert.
        for k, v in pub.items():
            if v is not None and isinstance(v, basestring):
                pub[k] = v.replace("'", r"\'")

        exp_name = "{}; {}".format(study_id, platform_id)

        study_source = None
        study_source_url = None
        study_link = None

        parts = study_id.split("-")
        if len(parts) >= 2 and parts[0] in study_source_map:
            ss = study_source_map[parts[0]]
            study_source = ss.get("name")
            study_source_url = ss.get("home_url")
            # Best-effort: a malformed link template must not abort the load.
            try:
                study_link = ss.get("link", "").format(parts[1])
            except Exception:
                pass

        ib.insert(i, exp_name, study_id, study_source, study_source_url, study_link,
                pub["title"], pub["short_authors"], pub["date"], pmid, pub["journal"],
                platform_id, platf["SO/platform_title"], "")

    log.debug("{} experiments inserted".format(ib.count))

    ib.close()
    cursor.close()
    conn.close()
    em.close()
    es.close()