Example #1
0
def summary_from_link(db, dbfrom, id, out=None):
    """Return the summaries of the records found by a link query.

    The link from ``dbfrom`` to ``db`` for ``id`` is resolved with
    ``ew.webenv_link``; the ``(webenv, query_key)`` pair it yields is then
    passed, together with ``out``, to ``ew.etree_summary``, whose result is
    returned unchanged.
    """
    link_result = ew.webenv_link(db, dbfrom, id)
    webenv, query_key = link_result
    return ew.etree_summary(
        db,
        webenv=webenv,
        query_key=query_key,
        out=out,
    )
Example #2
0
def summary_from_search(db, term, field, out=None):
    """Return the summaries of the records found by a search query.

    ``term`` is searched in ``field`` of ``db`` with ``ew.webenv_search``;
    the ``(webenv, query_key)`` pair it yields is then passed, together with
    ``out``, to ``ew.etree_summary``, whose result is returned unchanged.
    """
    search_result = ew.webenv_search(db, term, field)
    webenv, query_key = search_result
    return ew.etree_summary(
        db,
        webenv=webenv,
        query_key=query_key,
        out=out,
    )
Example #3
0
def summary_bioprojects(gid, species_path):
    """Fast batch download of bioprojects using esummary instead of the entire
    record.

    Looks up the bioproject ids linked to genome ``gid`` and skips the ones
    already listed in ``<species_path>/bioproject_ids.dat`` (unless the
    module-level ``FORCE`` flag is set, in which case that file is truncated
    and every project is processed again).  For each remaining project that
    has linked nuccore ids, a file ``<species_path>/<pid>/<name>__<pid>.txt``
    is written.  Every processed project id, empty or not, is appended to
    ``bioproject_ids.dat``.

    Returns a ``(num_seen, num_dled)`` tuple: the number of bioproject ids
    linked to the genome before filtering, and the number of project files
    written by this call.

    Raises ``OSError`` if a project directory cannot be created for any
    reason other than already existing.
    """
    import errno  # Local import: only needed for the EEXIST check below.

    num_dled = 0

    # Check for already-processed projects.
    pids_fn = path.join(species_path, 'bioproject_ids.dat')
    seen_pids = parse_seen_project_ids(pids_fn)

    pids = set(ew.idlist_link('bioproject', 'genome', gid))
    num_seen = len(pids)
    if not FORCE:
        pids -= seen_pids
    else:
        # Forced run: truncate the bookkeeping file so it is rebuilt below.
        with open(pids_fn, 'w') as outf_pids:
            pass
    if not pids:
        # Nothing (new) to download.
        return num_seen, num_dled
    pid_query = sorted(pids)

    # TODO may be able to get the nuccore results here via elink history.
    projects_root = ew.etree_summary('bioproject', pid_query)
    projects = projects_root.find('DocumentSummarySet')
    # Sanity check: one summary per requested id.
    # NOTE(review): this check is stripped when Python runs with -O.
    assert len(projects) == len(pids)

    with open(pids_fn, 'a') as outf_pids:
        for project in projects:
            try:
                pid = project.attrib['uid']
            except KeyError:
                # Only a missing 'uid' attribute marks a failed query; other
                # exceptions (e.g. KeyboardInterrupt) must not be swallowed.
                print("Skipping: failed query", file=sys.stderr)
                continue

            taxid = get_safe_attr(project, 'TaxId')
            name = slugify(get_safe_attr(project, 'Project_Name', 'Unknown'))
            title = slugify(get_safe_attr(project, 'Project_Title'))

            nucids = ew.idlist_link('nuccore', 'bioproject', pid)
            if not nucids:
                # Empty bioproject: record it as seen so it is not retried.
                if DEBUG:
                    print("Skipping: bioproject %s is empty" % pid,
                          file=sys.stderr)
                print(pid, file=outf_pids)
                continue

            project_dir = path.join(species_path, pid)
            try:
                os.mkdir(project_dir)
            except OSError as e:
                # An already-existing project directory is fine; reuse it.
                if e.errno != errno.EEXIST:
                    raise

            # Filenames will look like: Candida_albicans_SC5314__10701.txt
            project_fn = '__'.join((name, pid))
            project_path = path.join(project_dir, project_fn + ".txt")

            # File contents, one item per line: project title, taxid,
            # comma-separated nucleotide ids.
            with open(project_path, 'w') as outf_proj:
                print('\n'.join((title, taxid)), file=outf_proj)
                print(','.join(nucids), file=outf_proj)
            print("Completed: %s" % pid, file=sys.stderr)
            num_dled += 1
            print(pid, file=outf_pids)
    return num_seen, num_dled
def summary_from_link(db, dbfrom, id, out=None):
    """Return the summaries of the records found by a link query.

    The link from ``dbfrom`` to ``db`` for ``id`` is resolved with
    ``ew.webenv_link``; the ``(webenv, query_key)`` pair it yields is then
    passed, together with ``out``, to ``ew.etree_summary``, whose result is
    returned unchanged.
    """
    link_result = ew.webenv_link(db, dbfrom, id)
    webenv, query_key = link_result
    return ew.etree_summary(
        db,
        webenv=webenv,
        query_key=query_key,
        out=out,
    )
def summary_from_search(db, term, field, out=None):
    """Return the summaries of the records found by a search query.

    ``term`` is searched in ``field`` of ``db`` with ``ew.webenv_search``;
    the ``(webenv, query_key)`` pair it yields is then passed, together with
    ``out``, to ``ew.etree_summary``, whose result is returned unchanged.
    """
    search_result = ew.webenv_search(db, term, field)
    webenv, query_key = search_result
    return ew.etree_summary(
        db,
        webenv=webenv,
        query_key=query_key,
        out=out,
    )
def summary_bioprojects(gid, species_path):
    """Fast batch download of bioprojects using esummary instead of the entire
    record.

    Looks up the bioproject ids linked to genome ``gid`` and skips the ones
    already listed in ``<species_path>/bioproject_ids.dat`` (unless the
    module-level ``FORCE`` flag is set, in which case that file is truncated
    and every project is processed again).  For each remaining project that
    has linked nuccore ids, a file ``<species_path>/<pid>/<name>__<pid>.txt``
    is written.  Every processed project id, empty or not, is appended to
    ``bioproject_ids.dat``.

    Returns a ``(num_seen, num_dled)`` tuple: the number of bioproject ids
    linked to the genome before filtering, and the number of project files
    written by this call.

    Raises ``OSError`` if a project directory cannot be created for any
    reason other than already existing.
    """
    import errno  # Local import: only needed for the EEXIST check below.

    num_dled = 0

    # Check for already-processed projects.
    pids_fn = path.join(species_path, 'bioproject_ids.dat')
    seen_pids = parse_seen_project_ids(pids_fn)

    pids = set(ew.idlist_link('bioproject', 'genome', gid))
    num_seen = len(pids)
    if not FORCE:
        pids -= seen_pids
    else:
        # Forced run: truncate the bookkeeping file so it is rebuilt below.
        with open(pids_fn, 'w') as outf_pids:
            pass
    if not pids:
        # Nothing (new) to download.
        return num_seen, num_dled
    pid_query = sorted(pids)

    # TODO may be able to get the nuccore results here via elink history.
    projects_root = ew.etree_summary('bioproject', pid_query)
    projects = projects_root.find('DocumentSummarySet')
    # Sanity check: one summary per requested id.
    # NOTE(review): this check is stripped when Python runs with -O.
    assert len(projects) == len(pids)

    with open(pids_fn, 'a') as outf_pids:
        for project in projects:
            try:
                pid = project.attrib['uid']
            except KeyError:
                # Only a missing 'uid' attribute marks a failed query; other
                # exceptions (e.g. KeyboardInterrupt) must not be swallowed.
                print("Skipping: failed query", file=sys.stderr)
                continue

            taxid = get_safe_attr(project, 'TaxId')
            name = slugify(get_safe_attr(project, 'Project_Name', 'Unknown'))
            title = slugify(get_safe_attr(project, 'Project_Title'))

            nucids = ew.idlist_link('nuccore', 'bioproject', pid)
            if not nucids:
                # Empty bioproject: record it as seen so it is not retried.
                if DEBUG:
                    print("Skipping: bioproject %s is empty" % pid,
                          file=sys.stderr)
                print(pid, file=outf_pids)
                continue

            project_dir = path.join(species_path, pid)
            try:
                os.mkdir(project_dir)
            except OSError as e:
                # An already-existing project directory is fine; reuse it.
                if e.errno != errno.EEXIST:
                    raise

            # Filenames will look like: Candida_albicans_SC5314__10701.txt
            project_fn = '__'.join((name, pid))
            project_path = path.join(project_dir, project_fn + ".txt")

            # File contents, one item per line: project title, taxid,
            # comma-separated nucleotide ids.
            with open(project_path, 'w') as outf_proj:
                print('\n'.join((title, taxid)), file=outf_proj)
                print(','.join(nucids), file=outf_proj)
            print("Completed: %s" % pid, file=sys.stderr)
            num_dled += 1
            print(pid, file=outf_pids)
    return num_seen, num_dled