Beispiel #1
0
 def test_figure(self):
     """
     Save one PNG per item produced by `figure_pvalues` for the
     "head" and "full" subdirectories of `basepath`.

     Each image is written next to the VCF it was computed from, using
     the item's 'name proposal' entry as the file stem.
     """
     from idiva.stat.vcf_to_fisher import figure_pvalues

     for subdir in ("head", "full"):
         folder = basepath / subdir
         # presumably `unlist1` yields the single matching file -- confirm
         vcf_path = unlist1(folder.glob("*.vcf.gz"))

         with ReadVCF.open(vcf_path) as reader:
             for plot in figure_pvalues(reader):
                 target = folder / plot.info['name proposal']
                 plot.f.savefig(target.with_suffix(".png"))
Beispiel #2
0
def post(vcf_file: Path):
    """
    Postprocessing stage: write a figure and a table for every item
    that `figure_pvalues` yields for `vcf_file`.

    For each item, a ".png" (the item's figure `f`) and a ".csv" (the
    dataframe stored under info['df'], separated by `SEP`) are written
    into the directory of `vcf_file`, named after info['name proposal'].
    """
    log.info("=> Entering the postprocessing stage.")

    from idiva.stat.vcf_to_fisher import figure_pvalues
    from idiva.io.vcf import SEP

    out_dir = vcf_file.parent

    with ReadVCF.open(vcf_file) as reader:
        for plot in figure_pvalues(reader):
            # Common stem; the suffix is swapped per output format below.
            stem = out_dir / plot.info['name proposal']
            log.info(F"Saving figure and data to {stem}.* .")

            png_path = stem.with_suffix(".png")
            plot.f.savefig(png_path)

            table: pandas.DataFrame = plot.info['df']
            csv_path = stem.with_suffix(".csv")
            table.to_csv(csv_path, sep=SEP)
Beispiel #3
0
    def translate_vcf(self, vcf) -> pd.DataFrame:
        """
        Load the leading columns of a vcf file into a dataframe of SNP variants.

        The data lines of `vcf` are parsed into the columns named by
        DataHandler.INIT_COLS. Rows whose REF or ALT is not exactly one of
        the bases A, C, G, T are discarded, CHROM is mapped through
        self.translate_chrom, the frame is passed through
        self.encode_ref_alt, and exact duplicate rows are removed.

        vcf: anything accepted by ReadVCF.open.

        Returns the resulting dataframe. According to the original
        documentation it carries the features CHROM, POS, ID, VAR
        (VAR is presumably added by encode_ref_alt -- confirm).

        Raises AssertionError if the download cache directory is missing
        or if any ALT entry lists several comma-separated alleles.
        """

        # NOTE(review): this directory is not used anywhere below; the check
        # is kept only so that the precondition seen by callers is unchanged.
        cache = (Path(__file__).parent.parent.parent.parent /
                 "input/download_cache").resolve()
        assert cache.is_dir()

        bases = ['A', 'C', 'G', 'T']

        with ReadVCF.open(vcf) as reader:
            with seek_then_rewind(reader.fd,
                                  seek=reader.dataline_start_pos) as fd:
                dataframe = pd.read_csv(
                    fd,
                    sep='\t',
                    usecols=range(len(DataHandler.INIT_COLS)),
                    header=None,
                    names=DataHandler.INIT_COLS,
                    # Builtin types instead of np.int / np.str: those aliases
                    # were removed from NumPy and meant the same thing.
                    dtype={
                        'CHROM': int,
                        'POS': int,
                        'ID': str,
                        'REF': str,
                        'ALT': str,
                    },
                )

        # ALT must hold a single allele, not several separated by ','.
        # Missing ALT values are ignored here; the SNP filter drops them.
        multi_allelic = dataframe['ALT'].str.contains(
            ',', regex=False, na=False)
        assert not multi_allelic.any()

        # Keep only SNP variants: REF and ALT are both a single base.
        # The membership test guarantees length-1 values from the allowed
        # alphabet, so no further per-row checks are needed afterwards.
        is_snp = dataframe['REF'].isin(bases) & dataframe['ALT'].isin(bases)
        # Copy so that the column assignment below works on an independent
        # frame rather than on a slice of the frame that was read.
        dataframe = dataframe[is_snp].copy()

        dataframe['CHROM'] = pd.to_numeric(dataframe[['CHROM']].apply(
            self.translate_chrom, axis=1))

        dataframe = self.encode_ref_alt(dataframe)

        # drop_duplicates returns a new frame; the result has to be kept.
        dataframe = dataframe.drop_duplicates()

        # TODO:        same CHROM POS and rsID but not same REF & ALT
        #              consequence of real world data (Kjong Nov 30)
        #              => identify samples by CHROM, POS and VAR
        #              same CHROM rsID REF ALT but not same POS
        #              => rsIDs are not completely unique !
        #              Ignore rsID (Kjong Nov 23)
        #
        # Notes from inspecting the data (number of unique IDs compared
        # with the number of rows):
        #
        #          CHROM       POS           ID REF ALT  VAR
        # 56638       17   1649616  rs544719440   A   G    2
        # 576511      17  19159733  rs540831825   A   G    2
        # 717227      17  27196477  rs202111951   T   C   10
        #
        # 919995      17  34642425  rs568794696   C   A    3
        # 2105598     17  77663493  rs148485780   C   T    5
        #
        #          CHROM       POS           ID REF ALT  VAR
        # 56637       17   1649616  rs544719440   A   C    1
        # 576510      17  19159733  rs540831825   A   C    1
        # 717226      17  27196477  rs202111951   T   A    9
        #
        # 919587      17  34540858  rs568794696   C   A    3
        # 2105592     17  77663435  rs148485780   C   T    5

        return dataframe
Beispiel #4
0
        dataframe.to_csv(file_path, sep='\t', index=False)

        return dataframe


if __name__ == '__main__':
    dh = DataHandler()

    print(dh.preprocess_clinvar())

    cache = (Path(__file__).parent.parent.parent.parent /
             "input/download_cache").resolve()
    assert cache.is_dir()

    with ReadVCF.open(cache / 'control_v2.vcf') as ctrl_vcf:
        with ReadVCF.open(cache / 'case_processed_v2.vcf') as case_vcf:
            test_set = dh.create_test_set_v2(
                case_vcf=case_vcf,
                ctrl_vcf=ctrl_vcf,
            )
            print(test_set)
    """
    print(dataframe)

    cache = (Path(__file__).parent.parent.parent.parent / "input/download_cache").resolve()
    assert cache.is_dir()

    file_path = str(cache) + "/cadd_full.vcf"

    dataframe = dataframe.fillna(value=".")