Ejemplo n.º 1
0
def test_splice_site_str_to_tuple():
    """A comma-separated splice site string should come back as a tuple."""
    from outrigger.validate.check_splice_sites import splice_site_str_to_tuple

    expected = ('GT/AG', 'AT/AC')
    observed = splice_site_str_to_tuple('GT/AG,AT/AC')

    assert observed == expected
Ejemplo n.º 2
0
    def execute(self):
        """Validate the splice sites of every event, for every splice type.

        For each ``(splice_name, splice_abbrev)`` pair in
        ``common.SPLICE_TYPES`` this method:

        1. Looks up the splice sites of each pair of consecutive exons of
           every isoform via ``self.exon_pair_splice_sites``.
        2. Writes the combined table to
           ``<index_folder>/<splice_abbrev>/splice_sites.csv``.
        3. Keeps only the rows whose splice sites are *all* among the
           values parsed from ``self.valid_splice_sites``.
        4. Copies the header line, plus every line of the original events
           file (``<input_index>/<splice_abbrev>/<EVENTS_CSV>``) whose first
           comma-separated field is in the validated index, into
           ``<index_folder>/<splice_abbrev>/validated/<EVENTS_CSV>``.

        Returns
        -------
        None
            Results are written to disk; progress is reported through
            ``util.progress`` / ``util.done``.
        """
        valid_splice_sites = check_splice_sites.splice_site_str_to_tuple(
            self.valid_splice_sites)
        # Only used in progress messages; it is the same for every splice
        # type and isoform, so build it once instead of once per isoform.
        valid_str = ' or '.join(valid_splice_sites)

        for splice_name, splice_abbrev in common.SPLICE_TYPES:
            splice_name_spaces = splice_name.replace('_', ' ').title()
            util.progress('Finding valid splice sites in {} ({}) '
                          'splice type ...'.format(splice_name_spaces,
                                                   splice_abbrev.upper()))
            isoform_exons = common.SPLICE_TYPE_ISOFORM_EXONS[splice_abbrev]

            validated_folder = os.path.join(self.index_folder, splice_abbrev,
                                            'validated')
            self.maybe_make_folder(validated_folder)

            # One entry per consecutive exon pair, across all isoforms.
            # NOTE(review): presumably each entry is a pandas Series indexed
            # by event id -- confirm against exon_pair_splice_sites.
            splice_sites_seriess = []

            for isoform, exons in isoform_exons.items():
                util.progress('\tFinding valid splice sites for {isoform} of'
                              ' {splice_name} events which match '
                              '{valid_splice_sites}'
                              '...'.format(isoform=isoform,
                                           splice_name=splice_name_spaces,
                                           valid_splice_sites=valid_str))
                # Each pair of neighbouring exons is looked up separately.
                for exonA, exonB in zip(exons, exons[1:]):
                    util.progress('\t\tFinding splice sites for {exonA} and '
                                  '{exonB} ...'.format(exonA=exonA,
                                                       exonB=exonB))
                    intron_splice_site = self.exon_pair_splice_sites(
                        exonA, exonB, splice_abbrev)
                    splice_sites_seriess.append(intron_splice_site)
                    util.done(4)
                util.done(3)
            # Side by side: one column per exon pair.
            splice_sites = pd.concat(splice_sites_seriess, axis=1)

            splice_sites_csv = os.path.join(self.index_folder, splice_abbrev,
                                            'splice_sites.csv')
            util.progress('\tWriting splice sites to {csv} ...'.format(
                csv=splice_sites_csv))
            splice_sites.to_csv(splice_sites_csv)
            util.done(3)

            # Count distinct level-0 index values. ``axis=0`` is
            # the groupby default and the keyword itself is deprecated in
            # recent pandas, so it is not passed explicitly.
            n_total = len(splice_sites.groupby(level=0))
            splice_sites_is_valid = splice_sites.isin(valid_splice_sites)
            # A row is valid only when every one of its columns is valid.
            valid_events_rows = splice_sites_is_valid.all(axis=1)
            splice_sites_validated = splice_sites.loc[valid_events_rows]
            n_valid = len(splice_sites_validated.groupby(level=0))

            util.progress("\tValidated {valid}/{total} {splice_name} "
                          "({splice_abbrev}) events. "
                          "".format(valid=n_valid, total=n_total,
                                    splice_name=splice_name_spaces,
                                    splice_abbrev=splice_abbrev.upper()))

            original_events_csv = os.path.join(self.input_index,
                                               splice_abbrev, EVENTS_CSV)
            validated_events_csv = os.path.join(validated_folder, EVENTS_CSV)
            util.progress('\tWriting validated events to {csv} ...'.format(
                csv=validated_events_csv))

            validated_index = splice_sites_validated.index
            with open(validated_events_csv, 'w') as f_validated, \
                    open(original_events_csv) as f_original:
                # The first line is the header and is always copied; an empty
                # input file simply produces an empty output file.
                header = next(f_original, None)
                if header is not None:
                    f_validated.write(header)
                for line in f_original:
                    # The first comma-separated field is matched against
                    # the index of the validated splice sites.
                    if line.split(',')[0] in validated_index:
                        f_validated.write(line)
            util.done(3)