Example #1
    def normalize(self):
        dd = self.unpacked_filepath
        subdirs = ['{}_families'.format(i) for i in 'AGPB']
        dn = self.normalized_filedir
        assure_dir_exists(dn)
        stem = ''
        for s in subdirs:
            nsd = os.path.join(dn, s)
            assure_dir_exists(nsd)
            in_sd = os.path.join(dd, s)
            csvf = [i for i in os.listdir(in_sd) if i.endswith('.csv')]
            csvf.sort()
            if not csvf:
                continue
            for i in csvf:
                inp_fp = os.path.join(in_sd, i)
                stem = i[:-4]
                out_dir = os.path.join(nsd, stem)
                normalize_plantlist_file(inp_fp,
                                         out_dir,
                                         stem,
                                         maj_group_id=s[0])
            _LOG.info('last stem = {}'.format(stem))

        _LOG.info("dd = {} dn = {}".format(dd, subdirs))
Example #2
def scrape_families_from_higher_group(out_dir, top_file):
    global _num_downloads_this_session
    dirname = os.path.split(top_file)[1] + '_families'
    fam_dir = os.path.join(out_dir, dirname)
    assure_dir_exists(fam_dir)
    with io.open(top_file, 'r', encoding='utf-8') as inp:
        top_content = inp.read()
    soup = Soup(top_content, 'html.parser')
    nametree_list = soup.select("#nametree > li")
    _LOG.debug("will write to {}".format(dirname))

    for list_item in nametree_list:
        if _num_downloads_this_session != 0:
            m = "Sleeping for {} seconds to be polite to the server..."
            _LOG.debug(m.format(THROTTLE_BREAK))
            time.sleep(THROTTLE_BREAK)

        fam_link = list_item.select('a')
        assert len(fam_link) == 1
        fam_link = fam_link[0]
        fam_rel_url = fam_link['href']
        fam_name = fam_link.string.strip()
        fam_dest = os.path.join(fam_dir, fam_name + '.html')
        template = u'{}{}' if fam_rel_url.startswith('/') else u'{}/{}'
        fam_url = template.format(DOMAIN, fam_rel_url)
        if not os.path.exists(fam_dest):
            _LOG.debug(u"Starting download from url = {} to {}".format(
                fam_url, fam_dest))
            download_large_file(fam_url, fam_dest)
            _num_downloads_this_session += 1
            _LOG.debug(u"Download completed to .".format(fam_url, fam_dest))
        download_csv_for_family(fam_dir, fam_dest, fam_url)
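
The download_large_file helper used above is not defined on this page. A plausible stand-in that streams the response to disk in chunks, using only the standard library (an assumption about its behavior, not the project's code):

import shutil
import urllib.request

def download_large_file(url, dest_path):
    # Stream the body to disk instead of reading it into memory at once.
    with urllib.request.urlopen(url) as resp, open(dest_path, 'wb') as outp:
        shutil.copyfileobj(resp, outp)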
Example #3
def write_taxon_json(obj, filepath):
    out_dir = os.path.split(filepath)[0]
    if out_dir:
        assure_dir_exists(out_dir)
    dtw = {}
    for k, v in obj.items():
        if isinstance(v, Taxon):
            dtw[k] = v.to_serializable_dict()
        else:
            dtw[k] = v
    write_as_json(dtw, filepath, separators=(',', ": "), indent=1)

def copy_file_list_by_linking(unpacked_dirp, normalized_dirp, file_list):
    assure_dir_exists(normalized_dirp)
    for fn in file_list:
        ufp = os.path.join(unpacked_dirp, fn)
        if os.path.exists(ufp):
            dfp = os.path.join(normalized_dirp, fn)
            if os.path.exists(dfp):
                _LOG.info(
                    'File already exists at "{}". Skipping link creation.'.
                    format(dfp))
            else:
                os.symlink(ufp, dfp)
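
write_as_json, called here and again in Examples #5, #6, and #9, is another unshown helper. Judging from its call sites it is a thin wrapper over json.dump, roughly (a sketch, not the real implementation):

import io
import json

def write_as_json(obj, filepath, indent=None, separators=None):
    # UTF-8 JSON serialization honoring the caller's formatting options.
    with io.open(filepath, 'w', encoding='utf-8') as outp:
        json.dump(obj, outp, indent=indent, separators=separators,
                  ensure_ascii=False)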
Example #5
def normalize_darwin_core_taxonomy(source, destination, res_wrapper):
    assure_dir_exists(destination)
    manifest_fp = os.path.join(source, 'meta.xml')
    manifest_root = ET.parse(manifest_fp).getroot()
    core_paths = []
    field2index = {}
    for el in manifest_root.findall('{http://rs.tdwg.org/dwc/text/}core'):
        for sub in el:
            if sub.tag.endswith('}id'):
                field2index['id'] = int(sub.attrib['index'])
            elif sub.tag.endswith('}field'):
                nns = os.path.split(sub.attrib['term'])[-1]
                field2index[nns] = int(sub.attrib['index'])
        for f in el.findall('{http://rs.tdwg.org/dwc/text/}files'):
            for loc in f.findall('{http://rs.tdwg.org/dwc/text/}location'):
                core_paths.append(loc.text.strip())
    if len(core_paths) != 1:
        raise ValueError(
            'Did not find a single core path in DwC file ("{}") found: {}'.
            format(manifest_fp, core_paths))
    taxon_fn = core_paths[0]
    proj_out = os.path.join(destination, 'projection.tsv')
    if not os.path.exists(proj_out):
        proj_in = os.path.join(source, taxon_fn)
        write_gbif_projection_file(proj_in, proj_out, field2index)
    homemade = {
        'id': 0,
        'parentNameUsageID': 1,
        'acceptedNameUsageID': 2,
        'canonicalName': 3,
        'taxonRank': 4,
        'taxonomicStatus': 5,
        'nameAccordingTo': 6,
    }

    itd = InterimTaxonomyData()
    to_remove, to_ignore, paleos = read_gbif_projection(
        proj_out,
        itd,
        homemade,
        do_gbif_checks=isinstance(res_wrapper, GBIFWrapper))
    add_fake_root(itd)
    remove_if_tips(itd, to_remove)
    o_to_ignore = find_orphaned(itd)
    to_ignore.update(o_to_ignore)
    prune_ignored(itd, to_ignore)
    _LOG.info('writing {} paleodb ids'.format(len(paleos)))
    with open(os.path.join(destination, 'paleo.tsv'), 'w') as paleofile:
        for taxon_id in paleos:
            paleofile.write('{}\n'.format(taxon_id))
    res_wrapper.post_process_interim_tax_data(itd)
    itd.write_to_dir(destination)
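
The literal '{http://rs.tdwg.org/dwc/text/}' prefixes in Example #5 are how ElementTree exposes XML namespaces: every tag parsed from a namespaced document comes back as '{uri}localname'. A self-contained illustration against a made-up meta.xml fragment (the fragment is invented; real Darwin Core archives vary):

import xml.etree.ElementTree as ET

SAMPLE = """<archive xmlns="http://rs.tdwg.org/dwc/text/">
 <core>
  <files><location>taxon.tsv</location></files>
  <id index="0"/>
  <field index="2" term="http://rs.tdwg.org/dwc/terms/taxonRank"/>
 </core>
</archive>"""

root = ET.fromstring(SAMPLE)
for el in root.findall('{http://rs.tdwg.org/dwc/text/}core'):
    for sub in el:
        # Prints tags such as '{http://rs.tdwg.org/dwc/text/}field'
        print(sub.tag, sub.attrib)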
Example #6
def normalize_silva_taxonomy(source, destination, res_wrapper):
    assure_dir_exists(destination)
    depends_on = res_wrapper.depends_on
    taxalotl_config = res_wrapper.config
    expect_id_fp, ncbi_mapping_res = None, None
    for dep_id in depends_on:
        dep_res = taxalotl_config.get_terminalized_res_by_id(
            dep_id, 'normalize silva')
        if not dep_res.has_been_unpacked():
            unpack_resources(taxalotl_config, [dep_id])
        if dep_res.schema.lower() == 'id list':
            dep_fp = os.path.join(dep_res.unpacked_filepath,
                                  dep_res.local_filename)
            expect_id_fp = dep_fp
        elif dep_res.schema.lower() in {'silva taxmap', "fasta silva taxmap"}:
            dep_fp = dep_res.normalized_filepath
            ncbi_mapping_res = dep_res
        else:
            raise ValueError('unrecognized dependency schema {}'.format(
                dep_res.schema))
        if not os.path.isfile(dep_fp):
            raise ValueError(
                "Silva processing dependency not found at: {}".format(dep_fp))
    if expect_id_fp is None:
        raise ValueError('ID list dependency not found.')
    if ncbi_mapping_res is None:
        raise ValueError('NCBI mapping dependency not found.')
    expect_tax_fp = os.path.join(res_wrapper.unpacked_filepath,
                                 res_wrapper.local_filename)
    if not os.path.isfile(expect_tax_fp):
        raise ValueError(
            "Silva taxon file not found at: {}".format(expect_tax_fp))
    acc_to_trim = ncbi_mapping_res.parse_acc_to_trim_from_ncbi()
    preferred = parse_silva_ids(expect_id_fp)
    itd = InterimTaxonomyData()
    part_name_to_silva_id = parse_silva_taxon_file(expect_tax_fp, preferred,
                                                   acc_to_trim, itd)
    _LOG.info('{} taxonomy IDs read'.format(len(itd.to_par)))
    res_wrapper.post_process_interim_tax_data(itd)
    itd.write_to_dir(destination)
    mapping_file = os.path.join(destination, GEN_MAPPING_FILENAME)
    write_as_json(part_name_to_silva_id,
                  mapping_file,
                  indent=2,
                  separators=(',', ': '))
Example #7
    def download(self):
        dd = self.unpacked_filepath
        assure_dir_exists(dd)
        _LOG.info("uf = {}".format(dd))
        top_files = []
        for u in self.url_list:
            pref, suff = os.path.split(u)
            if not suff:
                pref, suff = os.path.split(pref)
            _LOG.info("p = {} s = {}".format(pref, suff))
            assert suff
            dfp = os.path.join(dd, suff)
            top_files.append(dfp)
            if not os.path.exists(dfp):
                _LOG.debug("Starting download from {} to {}".format(u, dfp))
                download_large_file(u, dfp)
                _LOG.debug("Download from {} to {} completed.".format(u, dfp))
        for dfp in top_files:
            scrape_families_from_higher_group(dd, dfp)
        # Touch the sentinel file that records that the download finished.
        open(self.download_filepath, 'w').close()

def unpack_archive(archive_fp, unpack_fp, archive_format, wrapper):
    afl = archive_format.lower()
    if afl in ['tar+gzip']:
        _LOG.debug("gunzip_and_untar from {} to {} ...".format(
            archive_fp, unpack_fp))
        gunzip_and_untar(archive_fp, unpack_fp)
        _LOG.debug("gunzip_and_untar from {} to {} done.".format(
            archive_fp, unpack_fp))
    elif afl == 'zip':
        _LOG.debug("unzip from {} to {} ...".format(archive_fp, unpack_fp))
        unzip(archive_fp, unpack_fp)
        _LOG.debug("unzip from {} to {} done.".format(archive_fp, unpack_fp))
    elif afl == 'gzip':
        afn = os.path.split(archive_fp)[-1]
        if archive_fp.endswith(".gz"):
            fn = afn[:-3]
        elif archive_fp.endswith(".gzip"):
            fn = afn[:-5]
        else:
            raise RuntimeError(
                "Expecting gzipped archive to endwith .gz or .gzip")
        assure_dir_exists(unpack_fp)
        if wrapper.local_filename:
            dest = os.path.join(unpack_fp, wrapper.local_filename)
        else:
            dest = os.path.join(unpack_fp, fn)
        _LOG.debug("gunzip from {} to {} ...".format(archive_fp, dest))
        gunzip(archive_fp, dest)
        _LOG.debug("gunzip from {} to {} done.".format(archive_fp, dest))
    elif afl == 'text':
        assure_dir_exists(unpack_fp)
        lfn = getattr(wrapper, 'local_filename', None)
        if lfn is None:
            raise RuntimeError(
                "Resource must have a local_filename if its format is text")
        shutil.copyfile(archive_fp, os.path.join(unpack_fp, lfn))
    else:
        m = "Unpacking from {} format is not currently supported"
        raise NotImplementedError(m.format(archive_format))
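
gunzip, gunzip_and_untar, and unzip are project helpers not shown on this page. The single-file gunzip case, at least, is easy to sketch with the standard library (an assumption about what the helper does):

import gzip
import shutil

def gunzip(archive_fp, dest):
    # Decompress one .gz archive to dest, streaming in chunks.
    with gzip.open(archive_fp, 'rb') as inp, open(dest, 'wb') as outp:
        shutil.copyfileobj(inp, outp)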
Example #9
    def write_to_dir(self, destination):
        # Write out in OTT form
        d = tempfile.mkdtemp()
        fn = [
            'taxonomy.tsv', 'synonyms.tsv', 'forwards.tsv', 'about.json',
            'details.json'
        ]
        try:
            syn_order = self.write_ott_taxonomy_tsv(
                os.path.join(d, 'taxonomy.tsv'))
            write_ott_synonyms_tsv(os.path.join(d, 'synonyms.tsv'),
                                   self.synonyms, syn_order, self.details_log)
            if self.forwards:
                write_ott_forwards(os.path.join(d, 'forwards.tsv'),
                                   self.forwards)

            about_fp = os.path.join(d, 'about.json')
            write_as_json(self.about, about_fp, indent=2)
            self.finalize()
            write_ncbi_details_json(os.path.join(d, 'details.json'),
                                    self.details_log)
        except BaseException:
            # Clean up the staging directory on failure, then re-raise.
            for f in fn:
                tf = os.path.join(d, f)
                if os.path.exists(tf):
                    try:
                        os.remove(tf)
                    except OSError:
                        pass
            try:
                os.rmdir(d)
            except OSError:
                pass
            raise
        assure_dir_exists(destination)
        for f in fn:
            sfp = os.path.join(d, f)
            if os.path.exists(sfp):
                dfp = os.path.join(destination, f)
                os.rename(sfp, dfp)
        os.rmdir(d)
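
Example #9 stages its output in a temporary directory and renames the files into destination only after every write has succeeded, so a failure partway through cannot leave a half-written taxonomy behind. The same pattern in isolation (names here are illustrative, not from the source):

import os
import tempfile

def install_after_success(render, destination, filenames):
    # render(scratch) must write the output files; they are moved into
    # place only if it returns without raising.
    scratch = tempfile.mkdtemp()
    try:
        render(scratch)
    except BaseException:
        for f in filenames:
            p = os.path.join(scratch, f)
            if os.path.exists(p):
                os.remove(p)
        os.rmdir(scratch)
        raise
    os.makedirs(destination, exist_ok=True)
    for f in filenames:
        src = os.path.join(scratch, f)
        if os.path.exists(src):
            # os.rename assumes scratch and destination share a filesystem;
            # shutil.move is the portable alternative.
            os.rename(src, os.path.join(destination, f))
    os.rmdir(scratch)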
Example #10
def _write_d_as_tsv(header, dict_to_write, id_order, dest_path):
    if not dict_to_write:
        return []
    ret = []
    pd = os.path.split(dest_path)[0]
    assure_dir_exists(pd)
    _LOG.info('Writing {} records to "{}"'.format(len(dict_to_write),
                                                  dest_path))
    with io.open(dest_path, 'w', encoding='utf-8') as outp:
        outp.write(header)
        for i in id_order:
            el = dict_to_write.get(i)
            if el is not None:
                ret.append(i)
                outp.write(el)
        oset = frozenset(ret)
        for key, line in dict_to_write.items():
            if key not in oset:
                ret.append(key)
                outp.write(line)
    return ret
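
The contract here is that rows named in id_order are written first, in that order, and any remaining keys follow in dict iteration order; the returned list records what was actually written. A hypothetical call (values invented for illustration):

rows = {'b': 'b\t2\n', 'a': 'a\t1\n', 'c': 'c\t3\n'}
written = _write_d_as_tsv('uid\tval\n', rows, ['a', 'b'], '/tmp/demo.tsv')
# written == ['a', 'b', 'c']; 'c' is appended after the ordered ids.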
Example #11
def _write_syn_d_as_tsv(header, dict_to_write, id_order, dest_path):
    ltw = []
    for i in id_order:
        synlist = dict_to_write.get(i)
        if synlist is not None:
            for p in synlist:
                ltw.append(p[1])
    oset = frozenset(id_order)
    if dict_to_write:
        for key, synlist in dict_to_write.items():
            if key not in oset:
                for syn_pair in synlist:
                    ltw.append(syn_pair[1])
    if not ltw:
        return
    x = len(ltw)
    pd = os.path.split(dest_path)[0]
    assure_dir_exists(pd)
    _LOG.info('Writing {} records to "{}"'.format(x, dest_path))
    with io.open(dest_path, 'w', encoding='utf-8') as outp:
        outp.write(header)
        for line in ltw:
            outp.write(line)
Example #12
def normalize_plantlist_file(inp_fp, out_dir, family, maj_group_id):
    _LOG.info(u'{} to {}'.format(inp_fp, out_dir))
    fam_name = unidecode(family)
    id_to_line = {fam_name: [fam_name, maj_group_id, fam_name, 'family', AGI]}
    legit_ids = {
        fam_name,
    }
    illegit_ids = set()
    name_to_id = {}
    with io.open(inp_fp, 'r', encoding='utf-8', newline='') as csvfile:
        csvreader = csv.reader(csvfile)
        header = next(csvreader)
        _LOG.info(u'header = {}'.format(header))
        for n, raw_row in enumerate(csvreader):
            row = list(raw_row)
            taxon_id = row[0]
            fam = row[2]
            if fam != family:
                raise RuntimeError(
                    "Unexpected family in taxon {} of {}: {}".format(
                        n, family, row))
            genus = row[4]
            assert genus
            is_hybrid = bool(row[5])
            flags = 'hybrid' if is_hybrid else ''
            sp_epithet = row[6]
            infr_rank = row[7]
            infr_epi = row[8]
            par_id = None
            if infr_rank:
                rank = pl_rank_to_ott_rank[infr_rank]
                assert infr_epi
                name = ' '.join([genus, sp_epithet, infr_epi])
            else:
                if infr_epi:
                    rank = 'infraspecificname'
                    name = ' '.join([genus, sp_epithet, infr_epi])
                elif sp_epithet:
                    rank = 'species'
                    name = ' '.join([genus, sp_epithet])
                else:
                    rank = 'genus'
                    name = genus
                    par_id = fam_name
            tax_stat = row[10]
            id_to_line[taxon_id] = [taxon_id, par_id, name, rank, flags]
            if tax_stat.lower() == 'accepted':
                if name in name_to_id:
                    m = 'Name "{}" repeated in {}. IDs {} and {}. Ignoring the second...'
                    _LOG.warning(
                        m.format(name, family, name_to_id[name], taxon_id))
                    continue
                if rank == 'species' or rank == 'genus':
                    name_to_id[name] = taxon_id
                legit_ids.add(taxon_id)
            else:
                illegit_ids.add(taxon_id)
            _LOG.info(u'taxon_id={} "{}" "{}" "{}" rank={} tax_stat={}'.format(
                taxon_id, genus, sp_epithet, infr_epi, rank, tax_stat))
    # uid	|	parent_uid	|	name	|	rank	|	flags	|
    legit_gen, legit_sp, legit_infr = [], [], []
    for vid in legit_ids:
        line_el = id_to_line[vid]
        rank = line_el[3]
        if rank in ['genus', 'family']:
            if rank != 'family':
                legit_gen.append(vid)
            par_id = line_el[1]
        elif rank == 'species':
            name = line_el[2]
            gen_name = name.split(' ')[0]
            par_id = name_to_id.get(gen_name)
            if par_id is None:
                gen_gen_id = gen_name
                assert gen_gen_id not in id_to_line
                id_to_line[gen_gen_id] = [
                    gen_gen_id, fam_name, gen_name, 'genus', AGI
                ]
                name_to_id[gen_name] = gen_gen_id
                legit_gen.append(gen_gen_id)
                _LOG.info(
                    "autogenerating genus record for {}".format(gen_name))
                par_id = gen_gen_id
            legit_sp.append(vid)
        else:
            name = line_el[2]
            sp_name = ' '.join(name.split(' ')[:2])
            par_id = name_to_id.get(sp_name)
            if par_id is None:
                gen_sp_id = sp_name
                assert gen_sp_id not in id_to_line
                id_to_line[gen_sp_id] = [
                    gen_sp_id,
                    sp_name.split()[0], sp_name, 'species', AGI
                ]
                name_to_id[sp_name] = gen_sp_id
                _LOG.info(
                    "autogenerating species record for {}".format(sp_name))
                par_id = sp_name
                legit_sp.append(gen_sp_id)
            legit_infr.append(vid)
        line_el[1] = par_id
    id_order = legit_gen + legit_sp + legit_infr
    j = '\t|\t'
    taxon_fp = os.path.join(out_dir, 'taxonomy.tsv')
    assure_dir_exists(out_dir)
    with io.open(taxon_fp, 'w', encoding='utf-8') as outp:
        outp.write('{}\n'.format(
            j.join(['uid', 'parent_uid', 'name', 'rank', 'flags'])))
        outp.write('{}\n'.format(j.join(id_to_line[fam_name])))
        for i in id_order:
            outp.write(_gen_line(id_to_line[i]))
    not_accepted_fp = os.path.join(out_dir, 'not-accepted.tsv')
    with io.open(not_accepted_fp, 'w', encoding='utf-8') as outp:
        outp.write('{}\n'.format(j.join(['uid', 'name', 'rank', 'flags'])))
        for i in illegit_ids:
            line_el = id_to_line[i]
            tout = [line_el[0]] + line_el[2:]
            outp.write(_gen_line(tout))
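
_gen_line formats every data row above but is never defined on this page. Given the header join and the one-row-per-line output, it is presumably something like (a guess at an unshown helper):

def _gen_line(el):
    # Join the columns with the OTT-style '\t|\t' separator, substituting
    # empty strings for missing fields, and terminate the row.
    cols = ['' if i is None else u'{}'.format(i) for i in el]
    return '\t|\t'.join(cols) + '\n'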