Esempio n. 1
0
    def _createDatabase(self, destDir, ds_file):
        """Run ``TabixIndexer.index`` on ``ds_file`` with the "vcf" preset.

        :param destDir: forwarded to ``TabixIndexer.index`` as ``destDir``
        :param ds_file: forwarded to ``TabixIndexer.index`` as ``inputFilename``
        :return: base name (directory part stripped) of the path returned by
            ``TabixIndexer.index``
        """
        indexed_path = TabixIndexer.index(inputFilename=ds_file, destDir=destDir, preset="vcf")
        return os.path.basename(indexed_path)
    def _createDatabase(
        self, destDir, ds_file, ds_match_mode, index_column_names, annotation_column_names, column_names
    ):
        """Validate a tab-delimited datasource, record column data types and index the file.

        The header of ``ds_file`` is compared against ``column_names`` and
        ``annotation_column_names``, a data type label ("Float", "Integer",
        "Flag" or "String") is stored in ``self.columnDataTypes`` for every
        annotation column, and the file is then handed to
        ``TabixIndexer.index`` with the "tsv" preset.

        :param destDir: forwarded to ``TabixIndexer.index`` as ``destDir``
        :param ds_file: path of the tab-delimited file to read and index
        :param ds_match_mode: when this is "exact" and there are exactly three
            index columns, every annotation column is recorded as "String";
            otherwise types are inferred from the pandas dtypes
        :param index_column_names: names of the index columns; each must occur in
            ``column_names`` and there must be 3 or 5 of them
        :param annotation_column_names: columns whose data types are recorded
        :param column_names: ordered sequence of column names; the positions of
            the index columns in it are passed to the indexer
        :return: base name of the path returned by ``TabixIndexer.index``
        :raises ValueError: if an index column is not in ``column_names`` or the
            number of index columns is neither 3 nor 5
        :raises InputMismatchException: if the header check against
            ``column_names`` or ``annotation_column_names`` fails
        """
        # list.index raises ValueError for a name that is not in column_names.
        index_columns = [column_names.index(name) for name in index_column_names]

        if len(index_columns) not in (3, 5):
            raise ValueError("Wrong number of index columns.  Must be a comma separated list of length 3 or 5.")

        column_names = set(column_names)
        annotation_column_names = set(annotation_column_names)

        # Only the header matters for validation, so read one row and stop.
        data = pandas.read_csv(filepath_or_buffer=ds_file, delimiter="\t", iterator=True, chunksize=1)
        for chunk in data:
            fieldNames = set(chunk.columns)

            # NOTE(review): this set holds header columns that are absent from
            # column_names, while the message says the file is missing them --
            # confirm which direction is intended before changing either.
            missingColumns = fieldNames.difference(column_names)
            if missingColumns:
                # str.join works on Python 2 and 3; string.join exists only on Python 2.
                msg = "The input tsv, %s, is missing the following columns: %s." % (
                    ds_file,
                    ", ".join(sorted(str(name) for name in missingColumns)),
                )
                raise InputMismatchException(msg)

            missingColumns = annotation_column_names.difference(fieldNames)
            if missingColumns:
                msg = "The input tsv, %s, is missing the following annotation columns: %s." % (
                    ds_file,
                    ", ".join(sorted(str(name) for name in missingColumns)),
                )
                raise InputMismatchException(msg)
            break

        # The mode test does not depend on the data, so evaluate it once.
        infer_types = ds_match_mode != "exact" or len(index_column_names) != 3

        # Iterate through the file and determine each annotation column's data type
        data = pandas.read_csv(
            filepath_or_buffer=ds_file,
            delimiter="\t",
            iterator=True,
            chunksize=10000,
            usecols=annotation_column_names,
            na_values=["", ".", "-"],
        )
        for chunk in data:
            for column in chunk.columns:
                if not infer_types:
                    self.columnDataTypes[column] = "String"
                    continue

                dtype = chunk[column].dtype
                known = self.columnDataTypes.get(column) if column in self.columnDataTypes else None
                # Missing values make pandas fall back to a float dtype.
                if numpy.issubdtype(dtype, numpy.inexact):
                    # Never downgrade a column already recorded as "String".
                    if column not in self.columnDataTypes or known not in ("String",):
                        self.columnDataTypes[column] = "Float"
                elif numpy.issubdtype(dtype, numpy.integer):
                    # Never downgrade a column already recorded as "Float" or "String".
                    if column not in self.columnDataTypes or known not in ("Float", "String"):
                        self.columnDataTypes[column] = "Integer"
                elif numpy.issubdtype(dtype, numpy.bool_):
                    self.columnDataTypes[column] = "Flag"
                else:
                    self.columnDataTypes[column] = "String"

            # Inference looks at the first chunk only (at most 10000 rows); the
            # all-"String" case keeps reading the remaining chunks.
            if infer_types:
                break

        tabixIndexedFile = TabixIndexer.index(
            destDir=destDir, inputFilename=ds_file, fileColumnNumList=index_columns, preset="tsv"
        )
        logging.getLogger(__name__).info("%s file was created.", tabixIndexedFile)
        return os.path.basename(tabixIndexedFile)
    def _createDatabase(self, destDir, ds_file):
        """Index ``ds_file`` via ``TabixIndexer.index`` using the "vcf" preset.

        :param destDir: value given to the indexer's ``destDir`` argument
        :param ds_file: value given to the indexer's ``inputFilename`` argument
        :return: ``os.path.basename`` of the path the indexer returns
        """
        index_kwargs = {"destDir": destDir, "inputFilename": ds_file, "preset": "vcf"}
        return os.path.basename(TabixIndexer.index(**index_kwargs))