def test_makes_df(self):
        # For every file in PATHS, the object returned by `v0_df` must have
        # the same length as the list of items a `ReadVCF` reader yields.
        from idiva.clf.df import v0_df
        from idiva.io import ReadVCF
        from idiva.utils import seek_then_rewind

        for path in PATHS.values():
            with path.open(mode='r') as stream:
                assert isinstance(stream, io.TextIOBase)

                # Each pass over the stream runs inside its own
                # `seek_then_rewind` context and builds a fresh reader.
                with seek_then_rewind(stream):
                    n_datalines = len(list(ReadVCF(stream)))

                with seek_then_rewind(stream):
                    frame = v0_df(ReadVCF(stream))

                self.assertEqual(n_datalines, len(frame))
Beispiel #2
0
 def test_reads_all_lines(self):
     # The number of items yielded by a `ReadVCF` reader must equal the
     # number of lines `readlines()` returns from the same stream right
     # after the reader has been constructed.
     for path in PATHS.values():
         with open_maybe_gz(path, mode='r') as stream:
             reader = ReadVCF(stream)
             from idiva.utils import seek_then_rewind

             # Reference count, taken directly from the stream.
             with seek_then_rewind(stream, seek=None):
                 remaining_lines = stream.readlines()
                 expected = len(remaining_lines)

             # Candidate count, taken through the reader.
             with reader.rewind_when_done:
                 observed = len(list(reader))

             self.assertEqual(observed, expected)
Beispiel #3
0
def check_all(fd: typing.TextIO):
    """
    Run every check from `bag_of_assumptions` on `fd` and report the outcome.

    This is a generator.  For each check it yields a one-item dict
    `{label: passed}`, in a fixed order.  `passed` is False when the check
    raised `AssertionError` or `RuntimeError`, and True when it returned
    normally.  Any other exception propagates to the caller.

    Each check is called inside its own `seek_then_rewind(fd)` context.
    """
    from idiva.utils import seek_then_rewind

    # (label used in the yielded dict, attribute name on `bag_of_assumptions`)
    checks = (
        ('alt_column', 'alt_column'),
        ('The ID column is_unique', 'id_is_unique'),
        ('samples_column', 'samples_column'),
        ('ref_column', 'ref_column'),
        ('format_is_gt', 'format_is_gt'),
    )

    for (label, check_name) in checks:
        try:
            with seek_then_rewind(fd):
                # Look the check up only now, so that each one is resolved
                # right before it runs.
                getattr(bag_of_assumptions, check_name)(fd)
        except (AssertionError, RuntimeError):
            passed = False
        else:
            passed = True

        yield {label: passed}
    def test_md5_head(self):
        # `ReadVCF(fd).md5` must equal the MD5 hex digest of the text read
        # from the same stream inside `seek_then_rewind(fd, seek=0)`.
        import hashlib

        from idiva.io import ReadVCF, open_maybe_gz
        from idiva.utils import seek_then_rewind

        for path in PATHS_LARGE_HEAD.values():
            with open_maybe_gz(path, mode='r') as stream:
                assert isinstance(stream, io.TextIOBase)

                with seek_then_rewind(stream, seek=0):
                    text = stream.read()
                expected = hashlib.md5(text.encode()).hexdigest()

                observed = ReadVCF(stream).md5

                self.assertEqual(observed, expected)
Beispiel #5
0
def head(src: io.TextIOBase, n=100) -> io.TextIOBase:
    """
    Yield a read-only handle on a temporary copy of the head of a VCF stream.

    The copy consists of the leading "##" meta lines, the "#" header line
    and at most `n` of the lines that follow.  Every line is stripped of
    surrounding whitespace before it is written.

    This is a generator that yields exactly once; the temporary file is
    removed when the generator is resumed or closed.
    NOTE(review): presumably wrapped by `contextlib.contextmanager` where it
    is defined (the decorator is not visible here) -- confirm.

    `src` is read inside `seek_then_rewind(src)`.

    Raises AssertionError if the first line that does not start with "##"
    does not start with "#", or if one of the copied datalines starts with "#".

    NOTE(review): the temporary file is reopened by name while it is still
    open for writing; the `tempfile` documentation states that this does not
    work on Windows with the default arguments.
    """
    from idiva.utils import seek_then_rewind
    from tempfile import NamedTemporaryFile

    with NamedTemporaryFile(mode='w') as tf:
        with seek_then_rewind(src):
            # Write meta and header.  The loop ends after having written the
            # first line that does not start with "##", i.e. the header.
            line = "##"
            while line.startswith("##"):
                line = src.readline().strip()
                print(line, file=tf)

            assert line.startswith("#")

            # Write a few datalines
            for _ in range(n):
                raw = src.readline()
                if not raw:
                    # End of stream: `src` has fewer than `n` datalines.
                    # Stop here instead of padding the copy with empty lines.
                    break

                line = raw.strip()
                assert not line.startswith("#")
                print(line, file=tf)

        # Flush once, unconditionally: the file is reopened by name below and
        # the header must be on disk even when no dataline was written (n=0).
        tf.flush()

        with open(tf.name, mode='r') as fd:
            yield fd
def open_maybe_gz(file,
                  *,
                  mode='r') -> typing.Union[io.TextIOBase, io.BytesIO]:
    """
    Open `file` for reading, transparently decompressing gzip where detected.

    `file` can be
     - a text stream (`io.TextIOBase`): yielded unchanged, mode 'r' only
     - another `io.IOBase` stream: probed for gzip content and, for
       mode 'r', wrapped in an `io.TextIOWrapper`
     - a string starting with http://, https:// or ftp://: fetched through
       `idiva.download.download` and then handled as a binary stream
     - a path to a file: treated as gzipped iff its name ends with ".gz"

    `mode` is keyword-only and must be either 'r' (text, the default)
    or 'rb' (binary).

    This is a generator that yields the opened stream exactly once.
    NOTE(review): the usage below and the recursive `with open_maybe_gz(...)`
    call imply it is wrapped by `contextlib.contextmanager`; the decorator
    is not visible here -- confirm.

    Raises AssertionError for an unsupported `mode`, for a text stream
    combined with mode 'rb', for an unsupported stream type, and for a path
    that is not an existing file.

    Usage:
        with open_maybe_gz(path_to_file, mode='r') as fd:
            print(fd.read())
    """

    assert mode in ['r', 'rb']

    # Case 1: already a text stream -- hand it back as is.
    if isinstance(file, io.TextIOBase):
        assert (mode == 'r'), "Can't convert TextIOBase to mode='rb'."
        yield file
        return

    # Case 2: some other stream object.
    if isinstance(file, io.IOBase):
        assert isinstance(file, (io.BufferedIOBase, io.TextIOBase))

        # Wrappers created below are registered on the stack, so they are
        # closed when the caller leaves the context.
        with contextlib.ExitStack() as stack:
            import gzip
            try:
                # Probe: try to read two decompressed bytes.
                # `seek_then_rewind` is expected to be in scope at module
                # level; it is not imported within this function.
                with seek_then_rewind(file, seek=0):
                    with gzip.open(file) as test:
                        test.read(2)
            except gzip.BadGzipFile:
                # Assume this is not a gzipped stream
                pass
            else:
                # gzip didn't complain
                # interpret this as a gzipped stream
                file = stack.enter_context(gzip.open(file))

            if (mode == 'r'):
                if not isinstance(file, io.TextIOBase):
                    assert isinstance(file, io.BufferedIOBase)
                    # NOTE(review): closing an `io.TextIOWrapper` also closes
                    # the stream it wraps, so the caller's `file` is likely
                    # closed on exit -- confirm that this is intended.
                    file = stack.enter_context(io.TextIOWrapper(file))

            yield file

        return

    # Case 3: a URL -- download, then recurse with the binary stream (case 2).
    if isinstance(file, str) and re.match(r"^(http|https|ftp)://", file):
        from idiva.download import download
        with download(file).now.open(mode='rb') as fd:
            with open_maybe_gz(fd, mode=mode) as fd:
                yield fd
        return

    # Case 4: a path on the local file system.
    from pathlib import Path
    assert Path(file).is_file()

    file = str(file)

    # Here gzip is detected by the file name only; the content is not probed.
    if file.endswith(".gz"):
        import gzip
        with gzip.open(file, mode='rb') as fd:
            if (mode == 'r'):
                yield io.TextIOWrapper(fd)
            elif (mode == 'rb'):
                yield fd
    else:
        with open(file, mode=mode) as fd:
            yield fd
Beispiel #7
0
 def rewind_when_done(self):
     """
     Generator that yields once while `seek_then_rewind(self.fd, seek=None)`
     is active, so that the caller's code runs inside that context.
     """
     from idiva.utils import seek_then_rewind

     rewinding = seek_then_rewind(self.fd, seek=None)
     with rewinding:
         yield
Beispiel #8
0
    def preload_all(self):
        """
        Replace `self.datalines` by a list holding every item it yields.

        The iteration runs inside `seek_then_rewind(self.fd, seek=None)`.
        Returns `self`.
        """
        from idiva.utils import seek_then_rewind

        with seek_then_rewind(self.fd, seek=None):
            self.datalines = list(self.datalines)

        return self
    def feature_extraction_batch(self, reader_ctrl: ReadVCF, reader_case: ReadVCF, names_ctrl: List[str],
                                 names_case: List[str], batches_ctrl: List[int], batches_case: List[int], idx: int,
                                 clf, id: List[int]):
        """
        Train `clf` incrementally on one batch of samples and return it.

        One batch of sample columns is loaded from the control file and one
        from the case file.  Both are turned into frames with one row per
        sample and one column per entry of `id`, stacked, labelled
        (0 = control, 1 = case), shuffled with a fixed seed and passed to
        `clf.partial_fit`.

        Parameters:
            reader_ctrl, reader_case: readers whose `fd` is read starting at
                their `dataline_start_pos`.
            names_ctrl, names_case: column names.  The first three name the
                file columns 0, 1 and 4 and have to be 'CHROM', 'POS' and
                'ALT' for the processing below to work; the name of the
                sample in file column 9 + k is at position 3 + k.
            batches_ctrl, batches_case: batch boundaries.  Batch `idx` covers
                the samples batches[idx] (inclusive) to batches[idx + 1]
                (exclusive).
            idx: number of the batch to load.
            clf: classifier providing `partial_fit(X, y, classes=...)`.
            id: variant identifiers used as the feature columns; variants
                missing from a file are filled with the value 4.

        Returns:
            `clf`, after the call to `partial_fit`.
        """
        key_columns = ['CHROM', 'POS', 'ALT']

        def load_batch(fd, names, batches):
            # Read batch `idx` from `fd` as a (samples x `id`) frame.
            first, last = batches[idx], batches[idx + 1]

            batch_names = names[:3] + names[first + 3:last + 3]
            batch_columns = [0, 1, 4] + list(range(first + 9, last + 9))

            # Every sample column goes through `convert_strang`.
            converters = {
                name: self.convert_strang
                for name in batch_names
                if name not in key_columns
            }

            frame = pd.read_csv(fd, sep='\t', header=None,
                                usecols=batch_columns,
                                names=batch_names, converters=converters)

            frame = frame.drop_duplicates(key_columns, keep='first')

            # Replace the three key columns by a single 'ID' index.
            frame['ID'] = frame[key_columns].apply(index_map, axis=1)
            frame = frame.drop(key_columns, axis=1).set_index('ID')

            # Samples become rows; the columns are aligned to `id`.
            return frame.transpose().reindex(columns=id, fill_value=4)

        with seek_then_rewind(reader_ctrl.fd, seek=reader_ctrl.dataline_start_pos) as fd_ctrl:
            with seek_then_rewind(reader_case.fd, seek=reader_case.dataline_start_pos) as fd_case:
                dataframe_ctrl = load_batch(fd_ctrl, names_ctrl, batches_ctrl)
                dataframe_case = load_batch(fd_case, names_case, batches_case)

                labels = np.append(np.zeros(dataframe_ctrl.shape[0]),
                                   np.ones(dataframe_case.shape[0]))

                # `DataFrame.append` was removed in pandas 2.0;
                # `pd.concat` stacks the rows in the same way.
                dataframe = pd.concat([dataframe_ctrl, dataframe_case])

                dataframe, labels = shuffle(dataframe, labels, random_state=0)

                # Alternative kept for reference (Random Forest Classifier):
                #     clf.n_estimators += 10000
                #     clf.fit(dataframe, labels)
                clf.partial_fit(dataframe, labels, classes=[0, 1])

        return clf
            loaded_model = pickle.load(open(filename, 'rb'))
        else:
            warnings.warn("no model saved")
            loaded_model = DummyClassifier()
        return loaded_model


def align(case: ReadVCF, ctrl: ReadVCF):
    """
    Align the case and the control VCF file.

    From each file the first five columns (CHROM, POS, ID, REF, ALT) of the
    datalines are loaded, the row number is kept as a 'rowid' column, and
    rows duplicated in (CHROM, POS, REF, ALT) are dropped, keeping the first.
    The two frames are then combined by `join` (defined elsewhere; per the
    original description a join on chrom, pos, ref and alt), CHROM is
    translated to a number and the frame is indexed by a 'CPA_ID' computed
    from (CHROM, POS, ALT) with `index_map`.

    NOTE(review): no `return` statement is visible in this excerpt; the
    function may continue beyond what is shown -- confirm.
    """
    from idiva.utils import seek_then_rewind

    dfs = {}
    for (k, vcf) in zip(['case', 'ctrl'], [case, ctrl]):
        # Read from the start of the datalines.
        with seek_then_rewind(vcf.fd, seek=vcf.dataline_start_pos) as fd:
            dfs[k] = pd.read_csv(fd, sep='\t', usecols=[0, 1, 2, 3, 4], header=None,
                                 names=["CHROM", "POS", "ID", "REF", "ALT"])
            # Keep the position of each row as a nullable-integer 'rowid' column.
            dfs[k].index = dfs[k].index.rename(name="rowid")
            dfs[k] = dfs[k].reset_index().astype({'rowid': 'Int64'})

    dfs['case'] = dfs['case'].drop_duplicates(['CHROM', 'POS', 'REF', 'ALT'], keep='first')
    dfs['ctrl'] = dfs['ctrl'].drop_duplicates(['CHROM', 'POS', 'REF', 'ALT'], keep='first')

    df = join(case=dfs['case'], ctrl=dfs['ctrl'])

    # `translate_chrom` is applied row-wise to the one-column frame.
    df['CHROM'] = pd.to_numeric(df[['CHROM']].apply(translate_chrom, axis=1))

    df['CPA_ID'] = df[['CHROM', 'POS', 'ALT']].apply(index_map, axis=1)

    df = df.set_index('CPA_ID')
Beispiel #11
0
    def translate_vcf(self, vcf) -> pd.DataFrame:
        """
        Load the single-nucleotide variants of a VCF file into a dataframe.

        The first `len(DataHandler.INIT_COLS)` columns of the datalines are
        read.  Only the rows whose REF and ALT are both one of A, C, G, T
        are kept.  CHROM is passed through `self.translate_chrom` and
        converted to a number, the frame is passed through
        `self.encode_ref_alt`, and fully duplicated rows are dropped.

        Per the original description the result contains the features
        CHROM, POS, ID, VAR (NOTE(review): VAR is presumably added by
        `encode_ref_alt` -- confirm).

        Parameters:
            vcf: whatever `ReadVCF.open` accepts.

        Returns:
            The resulting dataframe.

        Raises:
            AssertionError: if the download cache directory does not exist,
                or if an ALT entry lists several alleles separated by ','.
        """

        # NOTE(review): `cache` is not used below; only its existence is
        # asserted.  Kept as is, since callers may rely on this precondition.
        cache = (Path(__file__).parent.parent.parent.parent /
                 "input/download_cache").resolve()
        assert cache.is_dir()

        nucleotides = ['A', 'C', 'G', 'T']

        with ReadVCF.open(vcf) as reader:

            with seek_then_rewind(reader.fd,
                                  seek=reader.dataline_start_pos) as fd:

                # The builtin `int` / `str` replace the aliases `np.int` /
                # `np.str`, which were removed in NumPy 1.24.
                dataframe = pd.read_csv(fd,
                                        sep='\t',
                                        usecols=range(
                                            len(DataHandler.INIT_COLS)),
                                        header=None,
                                        names=DataHandler.INIT_COLS,
                                        dtype={
                                            'CHROM': int,
                                            'POS': int,
                                            'ID': str,
                                            'REF': str,
                                            'ALT': str,
                                        })

                # ALT has to hold a single value, not several separated by ','
                assert not any(
                    ',' in alt for alt in dataframe['ALT'].unique())

                # Store only SNP variants.  After this filter every REF and
                # ALT is a single letter out of A, C, G, T by construction,
                # so no further checks are needed.
                is_snp = (dataframe['REF'].isin(nucleotides)
                          & dataframe['ALT'].isin(nucleotides))
                dataframe = dataframe[is_snp]

        dataframe['CHROM'] = pd.to_numeric(dataframe[['CHROM']].apply(
            self.translate_chrom, axis=1))

        dataframe = self.encode_ref_alt(dataframe)

        # `drop_duplicates` returns a new frame; the result has to be kept.
        dataframe = dataframe.drop_duplicates()

        # TODO:        same CHROM POS and rsID but not same REF & ALT
        #              consequence of real world data (Kjong Nov 30)
        #              => identify samples by CHROM, POS and VAR
        #              same CHROM rsID REF ALT but not same POS
        #              => rsIDs are not completely unique !
        #              Ignore rsID (Kjong Nov 23)
        #
        # Examples of such rows, found with
        #     print(len(dataframe['ID'].unique().tolist()))
        #     print(len(dataframe['ID'].tolist()))
        #
        #          CHROM       POS           ID REF ALT  VAR
        # 56638       17   1649616  rs544719440   A   G    2
        # 576511      17  19159733  rs540831825   A   G    2
        # 717227      17  27196477  rs202111951   T   C   10
        #
        # 919995      17  34642425  rs568794696   C   A    3
        # 2105598     17  77663493  rs148485780   C   T    5
        #
        #          CHROM       POS           ID REF ALT  VAR
        # 56637       17   1649616  rs544719440   A   C    1
        # 576510      17  19159733  rs540831825   A   C    1
        # 717226      17  27196477  rs202111951   T   A    9
        #
        # 919587      17  34540858  rs568794696   C   A    3
        # 2105592     17  77663435  rs148485780   C   T    5

        return dataframe