Beispiel #1
0
def summary_bioprojects(gid, species_path):
    """Batch-download bioproject summaries linked to a genome via esummary.

    Looks up the bioproject ids linked to ``gid`` and skips ids already
    listed in ``<species_path>/bioproject_ids.dat``.  When the module-level
    ``FORCE`` flag is set, that file is truncated instead and every linked
    project is processed again.  For each project that has linked nuccore
    ids, a file ``<species_path>/<pid>/<name>__<pid>.txt`` is written with
    three lines: the slugified project title, the taxid, and the
    comma-separated nuccore ids.  Every handled project id (empty or not)
    is appended to ``bioproject_ids.dat``.

    Args:
        gid: Id used as the source of the ``genome`` -> ``bioproject`` link
            query (presumably an NCBI genome id; confirm against callers).
        species_path: Existing directory that receives the id log and the
            per-project sub-directories.

    Returns:
        Tuple ``(num_seen, num_dled)``: the number of linked bioproject ids
        found before filtering, and the number of project files written.

    Raises:
        AssertionError: If the summary response does not hold exactly one
            entry per requested project id.
        OSError: If a project directory cannot be created for any reason
            other than already existing.
    """
    # Imported locally so this function does not rely on a module-level
    # import of errno.
    import errno

    num_dled = 0

    # Check for already-processed projects.
    pids_fn = path.join(species_path, 'bioproject_ids.dat')
    seen_pids = parse_seen_project_ids(pids_fn)

    pids = set(ew.idlist_link('bioproject', 'genome', gid))
    num_seen = len(pids)
    if FORCE:
        # Forget earlier progress: truncate the log of processed ids.
        with open(pids_fn, 'w'):
            pass
    else:
        pids -= seen_pids
    if not pids:
        # Nothing new to process.
        return num_seen, num_dled

    # TODO may be able to get the nuccore results here via elink history.
    projects_root = ew.etree_summary('bioproject', sorted(pids))
    projects = projects_root.find('DocumentSummarySet')
    # Explicit check instead of a bare ``assert`` so it is not stripped
    # under ``python -O`` and a missing result set gives a clear message.
    num_returned = 0 if projects is None else len(projects)
    if num_returned != len(pids):
        raise AssertionError(
            "esummary returned %d bioproject summaries for %d requested ids"
            % (num_returned, len(pids)))

    with open(pids_fn, 'a') as outf_pids:
        for project in projects:
            try:
                pid = project.attrib['uid']
            except KeyError:
                # Entry carries no uid, so there is nothing to record.
                print("Skipping: failed query", file=sys.stderr)
                continue

            attributes = {
                'taxid': get_safe_attr(project, 'TaxId'),
                'name': slugify(
                    get_safe_attr(project, 'Project_Name', 'Unknown')),
                'title': slugify(get_safe_attr(project, 'Project_Title')),
            }

            nucids = ew.idlist_link('nuccore', 'bioproject', pid)
            if not nucids:
                # Empty bioproject: log it as seen, but write no file.
                if DEBUG:
                    print("Skipping: bioproject %s is empty" % pid,
                          file=sys.stderr)
                print(pid, file=outf_pids)
                continue

            project_dir = path.join(species_path, pid)
            try:
                os.mkdir(project_dir)
            except OSError as err:
                # An already-existing directory is fine; anything else is
                # a real failure.
                if err.errno != errno.EEXIST:
                    raise

            # Filenames will look like: Candida_albicans_SC5314__10701.txt
            project_fn = '__'.join((attributes['name'], pid))
            project_path = path.join(project_dir, project_fn + ".txt")

            # File contents: project title | taxid | nucleotide ids
            with open(project_path, 'w') as outf_proj:
                print('\n'.join((attributes['title'], attributes['taxid'])),
                      file=outf_proj)
                print(','.join(nucids), file=outf_proj)
            print("Completed: %s" % pid, file=sys.stderr)
            num_dled += 1
            print(pid, file=outf_pids)
    return num_seen, num_dled
def summary_bioprojects(gid, species_path):
    """Batch-download bioproject summaries linked to a genome via esummary.

    Looks up the bioproject ids linked to ``gid`` and skips ids already
    listed in ``<species_path>/bioproject_ids.dat``.  When the module-level
    ``FORCE`` flag is set, that file is truncated instead and every linked
    project is processed again.  For each project that has linked nuccore
    ids, a file ``<species_path>/<pid>/<name>__<pid>.txt`` is written with
    three lines: the slugified project title, the taxid, and the
    comma-separated nuccore ids.  Every handled project id (empty or not)
    is appended to ``bioproject_ids.dat``.

    Args:
        gid: Id used as the source of the ``genome`` -> ``bioproject`` link
            query (presumably an NCBI genome id; confirm against callers).
        species_path: Existing directory that receives the id log and the
            per-project sub-directories.

    Returns:
        Tuple ``(num_seen, num_dled)``: the number of linked bioproject ids
        found before filtering, and the number of project files written.

    Raises:
        AssertionError: If the summary response does not hold exactly one
            entry per requested project id.
        OSError: If a project directory cannot be created for any reason
            other than already existing.
    """
    # Imported locally so this function does not rely on a module-level
    # import of errno.
    import errno

    num_dled = 0

    # Check for already-processed projects.
    pids_fn = path.join(species_path, 'bioproject_ids.dat')
    seen_pids = parse_seen_project_ids(pids_fn)

    pids = set(ew.idlist_link('bioproject', 'genome', gid))
    num_seen = len(pids)
    if FORCE:
        # Forget earlier progress: truncate the log of processed ids.
        with open(pids_fn, 'w'):
            pass
    else:
        pids -= seen_pids
    if not pids:
        # Nothing new to process.
        return num_seen, num_dled

    # TODO may be able to get the nuccore results here via elink history.
    projects_root = ew.etree_summary('bioproject', sorted(pids))
    projects = projects_root.find('DocumentSummarySet')
    # Explicit check instead of a bare ``assert`` so it is not stripped
    # under ``python -O`` and a missing result set gives a clear message.
    num_returned = 0 if projects is None else len(projects)
    if num_returned != len(pids):
        raise AssertionError(
            "esummary returned %d bioproject summaries for %d requested ids"
            % (num_returned, len(pids)))

    with open(pids_fn, 'a') as outf_pids:
        for project in projects:
            try:
                pid = project.attrib['uid']
            except KeyError:
                # Entry carries no uid, so there is nothing to record.
                print("Skipping: failed query", file=sys.stderr)
                continue

            attributes = {
                'taxid': get_safe_attr(project, 'TaxId'),
                'name': slugify(
                    get_safe_attr(project, 'Project_Name', 'Unknown')),
                'title': slugify(get_safe_attr(project, 'Project_Title')),
            }

            nucids = ew.idlist_link('nuccore', 'bioproject', pid)
            if not nucids:
                # Empty bioproject: log it as seen, but write no file.
                if DEBUG:
                    print("Skipping: bioproject %s is empty" % pid,
                          file=sys.stderr)
                print(pid, file=outf_pids)
                continue

            project_dir = path.join(species_path, pid)
            try:
                os.mkdir(project_dir)
            except OSError as err:
                # An already-existing directory is fine; anything else is
                # a real failure.
                if err.errno != errno.EEXIST:
                    raise

            # Filenames will look like: Candida_albicans_SC5314__10701.txt
            project_fn = '__'.join((attributes['name'], pid))
            project_path = path.join(project_dir, project_fn + ".txt")

            # File contents: project title | taxid | nucleotide ids
            with open(project_path, 'w') as outf_proj:
                print('\n'.join((attributes['title'], attributes['taxid'])),
                      file=outf_proj)
                print(','.join(nucids), file=outf_proj)
            print("Completed: %s" % pid, file=sys.stderr)
            num_dled += 1
            print(pid, file=outf_pids)
    return num_seen, num_dled
Beispiel #3
0
def fetch_bioprojects(gid, species_path):
    """Slow efetch-based download of bioprojects linked to a genome.

    Looks up the bioproject ids linked to ``gid`` and skips ids already
    listed in ``<species_path>/bioproject_ids.dat``.  When the module-level
    ``FORCE`` flag is set, that file is truncated instead and every linked
    project is processed again.  Each remaining project is fetched in full
    and parsed with ``parse_project_tree``.  Projects whose parsed
    ``'pacc'`` entry is empty are logged as seen and skipped.  For the
    others a file ``<species_path>/<pid>/<name>__<pid>.txt`` is written
    with three lines: the project title, the taxid, and the comma-separated
    project accession codes.

    Args:
        gid: Id used as the source of the ``genome`` -> ``bioproject`` link
            query (presumably an NCBI genome id; confirm against callers).
        species_path: Existing directory that receives the id log and the
            per-project sub-directories.

    Returns:
        Tuple ``(num_seen, num_dled)``: the number of linked bioproject ids
        found before filtering, and the number of project files written.
        If a ``RetryException`` interrupts the run while ``DEBUG`` is off,
        the counts reached so far are returned.

    Raises:
        ew.RetryException: Re-raised only when ``DEBUG`` is set; otherwise
            it is reported on stderr and the remaining projects are dropped.
        OSError: If a project directory cannot be created for any reason
            other than already existing.
    """
    # Imported locally so this function does not rely on a module-level
    # import of errno.
    import errno

    # Initialised up front because the except branch below can reach the
    # return statement before the link query has completed.
    num_seen = 0
    num_dled = 0

    # Check for already-processed projects.
    pids_fn = path.join(species_path, 'bioproject_ids.dat')
    seen_pids = parse_seen_project_ids(pids_fn)

    try:
        pids = set(ew.idlist_link('bioproject', 'genome', gid))
        num_seen = len(pids)
        if FORCE:
            # Forget earlier progress: truncate the log of processed ids.
            with open(pids_fn, 'w'):
                pass
        else:
            pids -= seen_pids

        for pid in sorted(pids):
            # The log is reopened for every project, so it is closed (and
            # its buffer written out) as soon as that project is handled.
            with open(pids_fn, 'a') as outf_pids:
                # NOTE: only used for the diagnostics below, but kept here
                # so the set of queries that may raise RetryException is
                # unchanged.
                nucids = ew.idlist_link('nuccore', 'bioproject', pid)

                project = ew.etree_fetch('bioproject', pid)
                attributes = parse_project_tree(project)
                if not attributes['pacc']:
                    # Look for the refseq or gb project accession code,
                    # otherwise skip this project.
                    if DEBUG:
                        if nucids:
                            print(("Skipping: bioproject %s has no RS or GB "
                                   "accession code") % pid,
                                  file=sys.stderr)
                            print("It has %d nucleotide ids attached" %
                                  len(nucids),
                                  file=sys.stderr)
                        else:
                            # Only call the project empty when it really
                            # has no nucleotide ids attached.
                            print("Skipping: bioproject %s is empty" % pid,
                                  file=sys.stderr)
                    print(pid, file=outf_pids)
                    continue

                project_dir = path.join(species_path, pid)
                try:
                    os.mkdir(project_dir)
                except OSError as err:
                    # An already-existing directory is fine; anything else
                    # is a real failure.
                    if err.errno != errno.EEXIST:
                        raise

                # Filenames will look like: Candida_albicans_SC5314__10701.txt
                project_fn = '__'.join((attributes['name'], pid))
                project_path = path.join(project_dir, project_fn + ".txt")

                # Project title | taxid | project accession number(s)
                with open(project_path, 'w') as outf_proj:
                    print('\n'.join(
                        (attributes['title'], attributes['taxid'])),
                          file=outf_proj)
                    print(','.join(attributes['pacc']), file=outf_proj)
                print("Completed: %s" % pid, file=sys.stderr)

                num_dled += 1
                print(pid, file=outf_pids)
    except ew.RetryException as e:
        if DEBUG:
            # Bare raise keeps the original traceback.
            raise
        print("Skipping:", e, sep='\n', file=sys.stderr)
    return num_seen, num_dled
def fetch_bioprojects(gid, species_path):
    """Slow efetch-based download of bioprojects linked to a genome.

    Looks up the bioproject ids linked to ``gid`` and skips ids already
    listed in ``<species_path>/bioproject_ids.dat``.  When the module-level
    ``FORCE`` flag is set, that file is truncated instead and every linked
    project is processed again.  Each remaining project is fetched in full
    and parsed with ``parse_project_tree``.  Projects whose parsed
    ``'pacc'`` entry is empty are logged as seen and skipped.  For the
    others a file ``<species_path>/<pid>/<name>__<pid>.txt`` is written
    with three lines: the project title, the taxid, and the comma-separated
    project accession codes.

    Args:
        gid: Id used as the source of the ``genome`` -> ``bioproject`` link
            query (presumably an NCBI genome id; confirm against callers).
        species_path: Existing directory that receives the id log and the
            per-project sub-directories.

    Returns:
        Tuple ``(num_seen, num_dled)``: the number of linked bioproject ids
        found before filtering, and the number of project files written.
        If a ``RetryException`` interrupts the run while ``DEBUG`` is off,
        the counts reached so far are returned.

    Raises:
        ew.RetryException: Re-raised only when ``DEBUG`` is set; otherwise
            it is reported on stderr and the remaining projects are dropped.
        OSError: If a project directory cannot be created for any reason
            other than already existing.
    """
    # Imported locally so this function does not rely on a module-level
    # import of errno.
    import errno

    # Initialised up front because the except branch below can reach the
    # return statement before the link query has completed.
    num_seen = 0
    num_dled = 0

    # Check for already-processed projects.
    pids_fn = path.join(species_path, 'bioproject_ids.dat')
    seen_pids = parse_seen_project_ids(pids_fn)

    try:
        pids = set(ew.idlist_link('bioproject', 'genome', gid))
        num_seen = len(pids)
        if FORCE:
            # Forget earlier progress: truncate the log of processed ids.
            with open(pids_fn, 'w'):
                pass
        else:
            pids -= seen_pids

        for pid in sorted(pids):
            # The log is reopened for every project, so it is closed (and
            # its buffer written out) as soon as that project is handled.
            with open(pids_fn, 'a') as outf_pids:
                # NOTE: only used for the diagnostics below, but kept here
                # so the set of queries that may raise RetryException is
                # unchanged.
                nucids = ew.idlist_link('nuccore', 'bioproject', pid)

                project = ew.etree_fetch('bioproject', pid)
                attributes = parse_project_tree(project)
                if not attributes['pacc']:
                    # Look for the refseq or gb project accession code,
                    # otherwise skip this project.
                    if DEBUG:
                        if nucids:
                            print(("Skipping: bioproject %s has no RS or GB "
                                   "accession code") % pid, file=sys.stderr)
                            print("It has %d nucleotide ids attached" %
                                  len(nucids), file=sys.stderr)
                        else:
                            # Only call the project empty when it really
                            # has no nucleotide ids attached.
                            print("Skipping: bioproject %s is empty" % pid,
                                  file=sys.stderr)
                    print(pid, file=outf_pids)
                    continue

                project_dir = path.join(species_path, pid)
                try:
                    os.mkdir(project_dir)
                except OSError as err:
                    # An already-existing directory is fine; anything else
                    # is a real failure.
                    if err.errno != errno.EEXIST:
                        raise

                # Filenames will look like: Candida_albicans_SC5314__10701.txt
                project_fn = '__'.join((attributes['name'], pid))
                project_path = path.join(project_dir, project_fn + ".txt")

                # Project title | taxid | project accession number(s)
                with open(project_path, 'w') as outf_proj:
                    print('\n'.join((attributes['title'],
                                     attributes['taxid'])), file=outf_proj)
                    print(','.join(attributes['pacc']), file=outf_proj)
                print("Completed: %s" % pid, file=sys.stderr)

                num_dled += 1
                print(pid, file=outf_pids)
    except ew.RetryException as e:
        if DEBUG:
            # Bare raise keeps the original traceback.
            raise
        print("Skipping:", e, sep='\n', file=sys.stderr)
    return num_seen, num_dled