def spark_type_to_pandas_dtype(
    spark_type: types.DataType, *, use_extension_dtypes: bool = False
) -> Dtype:
    """Translate a Spark ``DataType`` into the corresponding pandas dtype.

    Parameters
    ----------
    spark_type : types.DataType
        The Spark data type to translate.
    use_extension_dtypes : bool, default False
        When true, and the module-level ``extension_dtypes_available`` flag is
        set, integral types map to pandas nullable integer dtypes. Boolean and
        string types (gated by ``extension_object_dtypes_available``) and
        float/double types (gated by ``extension_float_dtypes_available``)
        map to their pandas extension dtypes as well.

    Returns
    -------
    Dtype
        A pandas extension dtype when one of the extension mappings applies;
        otherwise ``object`` for date, null, array, map, struct and
        user-defined types, ``datetime64[ns]`` for timestamps, and for
        anything else the NumPy dtype reported by the Arrow type obtained
        from ``to_arrow_type``.
    """
    if use_extension_dtypes and extension_dtypes_available:
        # Ordered (Spark class, dtype factory) pairs. Each optional group is
        # appended only under its own availability flag, so its dtype classes
        # are looked up only when that flag is set.
        extension_candidates = [
            # IntegralType
            (types.ByteType, Int8Dtype),
            (types.ShortType, Int16Dtype),
            (types.IntegerType, Int32Dtype),
            (types.LongType, Int64Dtype),
        ]
        if extension_object_dtypes_available:
            extension_candidates += [
                # BooleanType
                (types.BooleanType, BooleanDtype),
                # StringType
                (types.StringType, StringDtype),
            ]
        if extension_float_dtypes_available:
            extension_candidates += [
                # FractionalType
                (types.FloatType, Float32Dtype),
                (types.DoubleType, Float64Dtype),
            ]

        for spark_cls, dtype_factory in extension_candidates:
            if isinstance(spark_type, spark_cls):
                return dtype_factory()

    # Spark types that are represented as plain Python objects.
    object_backed_types = (
        types.DateType,
        types.NullType,
        types.ArrayType,
        types.MapType,
        types.StructType,
        types.UserDefinedType,
    )
    if isinstance(spark_type, object_backed_types):
        return np.dtype("object")

    if isinstance(spark_type, types.TimestampType):
        return np.dtype("datetime64[ns]")

    # Everything else: defer to Arrow's own pandas dtype mapping.
    return np.dtype(to_arrow_type(spark_type).to_pandas_dtype())
def object_extension_dtypes(self):
    """List the boolean and string extension dtypes, as aliases and instances.

    Returns an empty list when ``extension_object_dtypes_available`` is
    falsy; otherwise the string aliases ``"boolean"`` and ``"string"``
    followed by fresh ``BooleanDtype()`` and ``StringDtype()`` instances.
    """
    if not extension_object_dtypes_available:
        return []
    return ["boolean", "string", BooleanDtype(), StringDtype()]
def string_extension_dtype(self):
    """List the string extension dtype, as its alias and as an instance.

    Returns an empty list when ``extension_object_dtypes_available`` is
    falsy; otherwise ``"string"`` followed by a fresh ``StringDtype()``.
    """
    if not extension_object_dtypes_available:
        return []
    return ["string", StringDtype()]
def name2taxid(names, sciname=False, threads=None, data_dir=None, debug=False):
    '''Look up NCBI taxids for taxon names via ``taxonkit name2taxid``.

    The names are written one per line to the standard input of a
    ``taxonkit name2taxid --show-rank`` subprocess, and its tab-separated
    standard output is parsed into a data frame.

    Parameters
    ----------
    names : list or iterable
        Species names or synonyms; each item is converted with ``str``.
    sciname : bool, default False
        By default, both scientific names and synonyms are supported; when
        `sciname=True`, ``--sci-name`` is passed and synonyms are ignored.
    threads : int
        Override the default taxonkit threads setting.
    data_dir : str, default None
        Location of the NCBI taxonomy `.dmp` files; by default, taxonkit
        searches in `~/.taxonkit/`.
    debug : bool, default False
        Print debugging output, e.g., system calls to `taxonkit`.

    Returns
    -------
    DataFrame
        Columns ``Name`` (string), ``TaxID`` (nullable unsigned 32-bit
        integer) and ``Rank`` (string).

    Raises
    ------
    TaxonKitCLIError
        If the ``taxonkit`` process exits with a non-zero status; the
        message is the process's standard error output.

    Examples
    --------
    >>> import pytaxonkit
    >>> names = ['Phyllobolus spinuliferus', 'Alteromonas putrefaciens', 'Rexia erectus']
    >>> pytaxonkit.name2taxid(names)
                           Name   TaxID     Rank
    0  Phyllobolus spinuliferus  359607  species
    1  Alteromonas putrefaciens      24  species
    2             Rexia erectus  262902  species
    >>> pytaxonkit.name2taxid(names, sciname=True)
                           Name  TaxID  Rank
    0  Phyllobolus spinuliferus   <NA>  <NA>
    1  Alteromonas putrefaciens   <NA>  <NA>
    2             Rexia erectus   <NA>  <NA>
    '''
    # One query name per line on taxonkit's stdin.
    query_text = '\n'.join(str(name) for name in names)

    command = ['taxonkit', 'name2taxid', '--show-rank']
    if sciname:
        command += ['--sci-name']
    if threads:
        command += ['--threads', validate_threads(threads)]
    if data_dir:
        command += ['--data-dir', validate_data_dir(data_dir)]  # pragma: no cover
    if debug:
        log(*command)  # pragma: no cover

    process = Popen(command, stdin=PIPE, stdout=PIPE, stderr=PIPE, universal_newlines=True)
    stdout_text, stderr_text = process.communicate(input=query_text)
    if process.returncode != 0:
        raise TaxonKitCLIError(stderr_text)  # pragma: no cover

    # The mapping supplies both the column names (its keys, in order) and
    # the dtype of each column.
    column_dtypes = {
        'Name': StringDtype(),
        'TaxID': UInt32Dtype(),
        'Rank': StringDtype(),
    }
    return pandas.read_csv(
        StringIO(stdout_text),
        sep='\t',
        header=None,
        names=list(column_dtypes),
        dtype=column_dtypes,
        index_col=False,
    )
def create_schema(phases, sources):
    """Build the column-name -> dtype mapping for a paired-entity data frame.

    Each per-entity field appears twice, prefixed with ``left_`` and
    ``right_``. Every field is a pandas string column except ``schema``,
    which is categorical over ``settings.SCHEMAS``. The pair-level columns
    ``judgement``, ``source``, ``phase``, ``features`` and ``schema`` are
    added after the per-entity fields.

    Parameters
    ----------
    phases
        Categories for the ``phase`` column.
    sources
        Categories for the ``source`` column.

    Returns
    -------
    dict
        Mapping of column name to dtype, in insertion order: all ``left_*``
        columns, all ``right_*`` columns, then the pair-level columns.
    """
    entity_fields = (
        "name",
        "schema",
        "collection_id",
        "id",
        "country",
        "address",
        "registrationNumber",
        "alias",
        "status",
        "classification",
        "gender",
        "firstName",
        "lastName",
        "birthPlace",
        "birthDate",
        "idNumber",
        "motherName",
        "nationality",
    )
    entity_dtypes = {
        field: CategoricalDtype(settings.SCHEMAS) if field == "schema" else StringDtype()
        for field in entity_fields
    }

    # The same dtype object is registered for both sides of the pair.
    schema = {}
    for side in ("left", "right"):
        for field, dtype in entity_dtypes.items():
            schema[f"{side}_{field}"] = dtype

    schema.update(
        {
            "judgement": bool,
            "source": CategoricalDtype(sources),
            "phase": CategoricalDtype(phases),
            "features": object,
            "schema": StringDtype(),
        }
    )
    return schema