def summary_from_link(db, dbfrom, id, out=None):
    """Run a link query and return the summaries of the linked records.

    The arguments ``db``, ``dbfrom`` and ``id`` are handed unchanged to
    ``ew.webenv_link``; the two values it returns are then forwarded to
    ``ew.etree_summary`` as ``webenv`` and ``query_key``, together with
    ``db`` and ``out``.

    Returns whatever ``ew.etree_summary`` returns.
    """
    link_handle = ew.webenv_link(db, dbfrom, id)
    # The link call yields exactly two values: the web environment token
    # and the query key identifying the stored result.
    env_token, key = link_handle
    return ew.etree_summary(db, webenv=env_token, query_key=key, out=out)
def summary_from_search(db, term, field, out=None):
    """Run a search query and return the summaries of the matching records.

    The arguments ``db``, ``term`` and ``field`` are handed unchanged to
    ``ew.webenv_search``; the two values it returns are then forwarded to
    ``ew.etree_summary`` as ``webenv`` and ``query_key``, together with
    ``db`` and ``out``.

    Returns whatever ``ew.etree_summary`` returns.
    """
    search_handle = ew.webenv_search(db, term, field)
    # The search call yields exactly two values: the web environment token
    # and the query key identifying the stored result.
    env_token, key = search_handle
    return ew.etree_summary(db, webenv=env_token, query_key=key, out=out)
def summary_bioprojects(gid, species_path):
    """Fast batch download of bioprojects using esummary instead of the
    entire record.

    NOTE(review): a second definition of ``summary_bioprojects`` appears
    later in this file and replaces this one when the module is loaded;
    the two should be merged.

    Args:
        gid: Identifier passed to ``ew.idlist_link('bioproject', 'genome',
            gid)`` to find the linked bioproject ids.
        species_path: Directory that holds ``bioproject_ids.dat`` (the list
            of already-processed project ids) and receives one
            sub-directory per downloaded project.

    Returns:
        A ``(num_seen, num_dled)`` tuple: the number of linked bioproject
        ids found (before already-processed ones are filtered out) and the
        number of project files written by this call.

    Raises:
        AssertionError: If the number of returned summaries differs from
            the number of ids queried.
        OSError: If a project directory cannot be created for any reason
            other than it already existing.
    """
    # Function-scope import: the module's import block is not visible here.
    import errno

    num_dled = 0

    # Check for already-processed projects.
    pids_fn = path.join(species_path, 'bioproject_ids.dat')
    seen_pids = parse_seen_project_ids(pids_fn)
    pids = set(ew.idlist_link('bioproject', 'genome', gid))
    num_seen = len(pids)
    if not FORCE:
        pids -= seen_pids
    else:
        # Truncate the record of processed ids so that the append-mode
        # writes below start from an empty file.
        with open(pids_fn, 'w'):
            pass

    if not pids:
        # Nothing (new) to download.
        return num_seen, num_dled

    pid_query = sorted(pids)
    # TODO may be able to get the nuccore results here via elink history.
    projects_root = ew.etree_summary('bioproject', pid_query)
    projects = projects_root.find('DocumentSummarySet')
    assert len(projects) == len(pids), (
        "expected %d bioproject summaries, got %d"
        % (len(pids), len(projects)))

    with open(pids_fn, 'a') as outf_pids:
        for project in projects:
            # Only a missing 'uid' marks a failed query; anything else
            # (including KeyboardInterrupt) must propagate.
            try:
                pid = project.attrib['uid']
            except (KeyError, AttributeError):
                print("Skipping: failed query", file=sys.stderr)
                continue

            # NOTE(review): get_safe_attr/slugify are defined elsewhere;
            # presumably they return strings -- confirm, since the values
            # are joined as text below.
            attributes = {
                'taxid': get_safe_attr(project, 'TaxId'),
                'name': slugify(
                    get_safe_attr(project, 'Project_Name', 'Unknown')),
                'title': slugify(get_safe_attr(project, 'Project_Title')),
            }

            nucids = ew.idlist_link('nuccore', 'bioproject', pid)
            if not nucids:
                # Empty bioproject: still record it as processed so it is
                # not queried again on the next run.
                if DEBUG:
                    print("Skipping: bioproject %s is empty" % pid,
                          file=sys.stderr)
                print(pid, file=outf_pids)
                continue

            project_dir = path.join(species_path, pid)
            try:
                os.mkdir(project_dir)
            except OSError as e:
                # An already-existing directory is fine; re-raise anything
                # else with its original traceback.
                if e.errno != errno.EEXIST:
                    raise

            # Filenames will look like: Candida_albicans_SC5314__10701.txt
            project_fn = '__'.join((attributes['name'], pid))
            project_path = path.join(project_dir, project_fn + ".txt")

            # File contents: project title | taxid | nucleotide ids
            with open(project_path, 'w') as outf_proj:
                print('\n'.join((attributes['title'], attributes['taxid'])),
                      file=outf_proj)
                print(','.join(nucids), file=outf_proj)

            print("Completed: %s" % pid, file=sys.stderr)
            num_dled += 1
            print(pid, file=outf_pids)

    return num_seen, num_dled
def summary_bioprojects(gid, species_path):
    """Fast batch download of bioprojects using esummary instead of the
    entire record.

    NOTE(review): an earlier definition of ``summary_bioprojects`` with the
    same body exists in this file; this later one is the binding in effect
    after the module loads. The duplicate should be removed.

    Args:
        gid: Identifier passed to ``ew.idlist_link('bioproject', 'genome',
            gid)`` to find the linked bioproject ids.
        species_path: Directory that holds ``bioproject_ids.dat`` (the list
            of already-processed project ids) and receives one
            sub-directory per downloaded project.

    Returns:
        A ``(num_seen, num_dled)`` tuple: the number of linked bioproject
        ids found (before already-processed ones are filtered out) and the
        number of project files written by this call.

    Raises:
        AssertionError: If the number of returned summaries differs from
            the number of ids queried.
        OSError: If a project directory cannot be created for any reason
            other than it already existing.
    """
    # Function-scope import: the module's import block is not visible here.
    import errno

    num_dled = 0

    # Check for already-processed projects.
    pids_fn = path.join(species_path, 'bioproject_ids.dat')
    seen_pids = parse_seen_project_ids(pids_fn)
    pids = set(ew.idlist_link('bioproject', 'genome', gid))
    num_seen = len(pids)
    if not FORCE:
        pids -= seen_pids
    else:
        # Truncate the record of processed ids so that the append-mode
        # writes below start from an empty file.
        with open(pids_fn, 'w'):
            pass

    if not pids:
        # Nothing (new) to download.
        return num_seen, num_dled

    pid_query = sorted(pids)
    # TODO may be able to get the nuccore results here via elink history.
    projects_root = ew.etree_summary('bioproject', pid_query)
    projects = projects_root.find('DocumentSummarySet')
    assert len(projects) == len(pids), (
        "expected %d bioproject summaries, got %d"
        % (len(pids), len(projects)))

    with open(pids_fn, 'a') as outf_pids:
        for project in projects:
            # Only a missing 'uid' marks a failed query; anything else
            # (including KeyboardInterrupt) must propagate.
            try:
                pid = project.attrib['uid']
            except (KeyError, AttributeError):
                print("Skipping: failed query", file=sys.stderr)
                continue

            # NOTE(review): get_safe_attr/slugify are defined elsewhere;
            # presumably they return strings -- confirm, since the values
            # are joined as text below.
            attributes = {
                'taxid': get_safe_attr(project, 'TaxId'),
                'name': slugify(
                    get_safe_attr(project, 'Project_Name', 'Unknown')),
                'title': slugify(get_safe_attr(project, 'Project_Title')),
            }

            nucids = ew.idlist_link('nuccore', 'bioproject', pid)
            if not nucids:
                # Empty bioproject: still record it as processed so it is
                # not queried again on the next run.
                if DEBUG:
                    print("Skipping: bioproject %s is empty" % pid,
                          file=sys.stderr)
                print(pid, file=outf_pids)
                continue

            project_dir = path.join(species_path, pid)
            try:
                os.mkdir(project_dir)
            except OSError as e:
                # An already-existing directory is fine; re-raise anything
                # else with its original traceback.
                if e.errno != errno.EEXIST:
                    raise

            # Filenames will look like: Candida_albicans_SC5314__10701.txt
            project_fn = '__'.join((attributes['name'], pid))
            project_path = path.join(project_dir, project_fn + ".txt")

            # File contents: project title | taxid | nucleotide ids
            with open(project_path, 'w') as outf_proj:
                print('\n'.join((attributes['title'], attributes['taxid'])),
                      file=outf_proj)
                print(','.join(nucids), file=outf_proj)

            print("Completed: %s" % pid, file=sys.stderr)
            num_dled += 1
            print(pid, file=outf_pids)

    return num_seen, num_dled