def test_create(self):
        """
        Check the per-step features produced for a generator configured with
        distance_to_start_of_motif=-3, a left flank of 3 and no right flank.
        """

        def kmer_index(first, second, third):
            # Base-4 index of a 3-mer built from the integer codes of its letters
            return 16 * first + 4 * second + 1 * third

        kmer_len = 3
        start_offset = -3
        upstream_update = -start_offset
        downstream_update = 0

        generator = MotifFeatureGenerator(
            motif_len=kmer_len,
            distance_to_start_of_motif=start_offset,
        )
        observed = ObservedSequenceMutations(
            start_seq="caagtatgaatgc",
            end_seq="caagcaagatagc",
            motif_len=kmer_len,
            left_flank_len=upstream_update,
            right_flank_len=downstream_update,
        )
        observed.set_start_feats(generator.get_base_features(observed))

        # Mutations are imputed in ascending position order
        mutation_order = sorted(observed.mutation_pos_dict.keys())
        imputed = ImputedSequenceMutations(observed, mutation_order)

        # Build the feature information for every mutation step
        steps = generator.create_for_mutation_steps(
            imputed, upstream_update, downstream_update)

        # First step: no neighbor has had its feature changed yet
        first_step = steps[0]
        self.assertEqual(first_step.mutating_pos_feats, kmer_index(0, 0, 2))
        self.assertEqual(len(first_step.neighbors_feat_new), 0)
        self.assertEqual(len(first_step.neighbors_feat_old), 0)

        # Second step: neighbor features are keyed by position with the
        # flanks ignored
        second_step = steps[1]
        self.assertEqual(second_step.mutating_pos_feats, kmer_index(2, 1, 0))
        self.assertEqual(second_step.neighbors_feat_old[0],
                         kmer_index(1, 0, 0))
        self.assertEqual(second_step.neighbors_feat_new[0],
                         kmer_index(1, 0, 0))
    def test_create(self):
        """
        Check the features produced at each mutation step for a centered
        3-mer generator (update region of one position on each side).
        """
        motif_len = 3
        left_update = 1
        right_update = 1
        feat_generator = MotifFeatureGenerator(motif_len=motif_len)
        obs_seq_mut = ObservedSequenceMutations(
            start_seq="aattatgaatgc",
            end_seq="atgcaagatagc",
            motif_len=motif_len,
        )
        feat_matrix = feat_generator.get_base_features(obs_seq_mut)
        obs_seq_mut.set_start_feats(feat_matrix)
        # The step indices asserted below assume ascending position order, so
        # sort explicitly rather than depend on dict iteration order.
        ordered_seq_mut = ImputedSequenceMutations(
            obs_seq_mut, sorted(obs_seq_mut.mutation_pos_dict.keys()))

        # Create the base_feat_vec_dicts and base_intermediate_seqs
        base_feat_mut_steps = feat_generator.create_for_mutation_steps(
            ordered_seq_mut, left_update, right_update)
        self.assertEqual(base_feat_mut_steps[0].mutating_pos_feats,
                         16 * 0 + 4 * 0 + 1 * 3)
        self.assertEqual(len(base_feat_mut_steps[0].neighbors_feat_new), 0)
        self.assertEqual(len(base_feat_mut_steps[0].neighbors_feat_old), 0)
        self.assertEqual(base_feat_mut_steps[1].mutating_pos_feats,
                         16 * 3 + 4 * 3 + 1 * 3)
        self.assertEqual(base_feat_mut_steps[1].neighbors_feat_old[1],
                         16 * 0 + 4 * 3 + 1 * 3)
        self.assertEqual(base_feat_mut_steps[1].neighbors_feat_new[1],
                         16 * 3 + 4 * 3 + 1 * 3)
        # Wrap keys() in list(): a keys view never compares equal to a list
        self.assertEqual(
            list(base_feat_mut_steps[1].neighbors_feat_new.keys()), [1])
        self.assertEqual(
            list(base_feat_mut_steps[1].neighbors_feat_old.keys()), [1])
        self.assertEqual(set(base_feat_mut_steps[4].neighbors_feat_new.keys()),
                         set([3, 5]))
        self.assertEqual(set(base_feat_mut_steps[4].neighbors_feat_old.keys()),
                         set([3, 5]))
    def test_create_downstream(self):
        """
        Check the features produced at each mutation step for a 3-mer
        generator with a left motif flank of 2 (update region: 2 positions to
        the left, none to the right).
        """
        motif_len = 3
        left_motif_flank_len = 2
        left_update = 2
        right_update = 0
        feat_generator = MotifFeatureGenerator(
            motif_len=motif_len,
            distance_to_start_of_motif=-left_motif_flank_len,
        )
        obs_seq_mut = ObservedSequenceMutations(
            start_seq="aaattatgaatgc",
            end_seq="aatgcaagatagc",
            motif_len=motif_len,
            left_flank_len=left_motif_flank_len,
            right_flank_len=motif_len - 1 - left_motif_flank_len,
        )
        feat_matrix = feat_generator.get_base_features(obs_seq_mut)
        obs_seq_mut.set_start_feats(feat_matrix)
        # The step indices asserted below assume ascending position order, so
        # sort explicitly rather than depend on dict iteration order.
        ordered_seq_mut = ImputedSequenceMutations(
            obs_seq_mut, sorted(obs_seq_mut.mutation_pos_dict.keys()))

        # Create the base_feat_vec_dicts and base_intermediate_seqs
        base_feat_mut_steps = feat_generator.create_for_mutation_steps(
            ordered_seq_mut, left_update, right_update)
        self.assertEqual(base_feat_mut_steps[0].mutating_pos_feats,
                         16 * 0 + 4 * 0 + 1 * 0)
        self.assertEqual(len(base_feat_mut_steps[0].neighbors_feat_new), 0)
        self.assertEqual(len(base_feat_mut_steps[0].neighbors_feat_old), 0)
        self.assertEqual(base_feat_mut_steps[1].mutating_pos_feats,
                         16 * 0 + 4 * 3 + 1 * 3)
        self.assertEqual(len(base_feat_mut_steps[1].neighbors_feat_new), 0)
        self.assertEqual(len(base_feat_mut_steps[1].neighbors_feat_old), 0)
        # Wrap keys() in list(): a keys view never compares equal to a list
        self.assertEqual(
            list(base_feat_mut_steps[4].neighbors_feat_old.keys()), [3])
        self.assertEqual(
            list(base_feat_mut_steps[4].neighbors_feat_new.keys()), [3])
    def test_statistics(self):
        """
        Smoke test: read the input data and print its statistics lines for a
        3-mer feature generator. Nothing is asserted.
        """
        motif_len = 3

        feat_generator = MotifFeatureGenerator(motif_len=motif_len)
        # The metadata returned alongside the sequences is not needed here
        seqs, _metadata = read_gene_seq_csv_data(INPUT_GENES, INPUT_SEQS,
                                                 motif_len)
        # Single-argument call form prints the same text on Python 2 and 3
        print(get_data_statistics_print_lines(seqs, feat_generator))
    def test_create_all(self):
        """
        Check mutation-step features for two 3-mer generators that differ
        only in their left motif flank length (1 and 2) while sharing the
        same sequence flanks and update region.
        """
        motif_len = 3
        left_flank_lens = [1, 2]
        left_update = max(left_flank_lens)
        right_update = motif_len - 1 - min(left_flank_lens)

        def mutation_steps_for(left_motif_flank_len):
            # Build generator + observation for one flank length and return
            # the features for every mutation step.
            feat_generator = MotifFeatureGenerator(
                motif_len=motif_len,
                distance_to_start_of_motif=-left_motif_flank_len,
                flank_len_offset=max(left_flank_lens) - left_motif_flank_len,
            )
            obs_seq_mut = ObservedSequenceMutations(
                start_seq="aaattatgaatgc",
                end_seq="aatgcaagatagc",
                motif_len=motif_len,
                left_flank_len=max(left_flank_lens),
                right_flank_len=motif_len - 1 - min(left_flank_lens),
            )
            feat_matrix = feat_generator.get_base_features(obs_seq_mut)
            obs_seq_mut.set_start_feats(feat_matrix)
            # The step indices asserted below assume ascending position
            # order, so sort rather than depend on dict iteration order.
            ordered_seq_mut = ImputedSequenceMutations(
                obs_seq_mut, sorted(obs_seq_mut.mutation_pos_dict.keys()))
            return feat_generator.create_for_mutation_steps(
                ordered_seq_mut, left_update, right_update)

        # Create the base_feat_vec_dicts and base_intermediate_seqs
        base_feat_mut_steps1 = mutation_steps_for(1)
        base_feat_mut_steps2 = mutation_steps_for(2)

        self.assertEqual(base_feat_mut_steps1[0].mutating_pos_feats,
                         16 * 0 + 4 * 0 + 1 * 3)
        self.assertEqual(base_feat_mut_steps2[0].mutating_pos_feats,
                         16 * 0 + 4 * 0 + 1 * 0)
        self.assertEqual(base_feat_mut_steps1[1].neighbors_feat_old[1],
                         16 * 0 + 4 * 3 + 1 * 3)
        self.assertEqual(base_feat_mut_steps1[1].neighbors_feat_new[1],
                         16 * 3 + 4 * 3 + 1 * 3)
# Example #6
# 0
    def setUpClass(cls):
        """
        Shared fixture: a seeded random theta vector for a 5-mer motif
        generator, plus the pair of fuse index collections derived from the
        generator's motif list.
        """
        # Fix the seed first so theta_g is reproducible
        np.random.seed(10)

        cls.motif_len = 5
        generator = MotifFeatureGenerator(motif_len=cls.motif_len)

        num_feats = generator.feature_vec_len
        cls.feature_vec_len = num_feats
        cls.theta_g = np.random.rand(num_feats)

        fuse_pair = cls._get_fuse_indices(generator.motif_list)
        cls.fuse_idx1, cls.fuse_idx2 = fuse_pair
    def test_time(self):
        """
        Just a test to see how fast things are running

        Prints the wall-clock time of get_base_features and of
        create_for_mutation_steps on a random 400-base sequence; nothing is
        asserted.
        """
        np.random.seed(0)

        motif_len = 3
        seq_length = 400
        mut_per_length = 10
        left_update = 1
        right_update = 1

        feat_generator = MotifFeatureGenerator(motif_len=motif_len)

        start_seq = get_random_dna_seq(seq_length)
        # Mutate a 10th of the sequence
        end_seq = list(start_seq)
        # Floor division keeps the range start an int on Python 2 and 3
        for i in range(motif_len // 2, seq_length, mut_per_length):
            nucleotide_code = NUCLEOTIDE_DICT[end_seq[i]]
            if nucleotide_code == 0:
                end_seq[i] = "t"
            else:
                end_seq[i] = NUCLEOTIDES[nucleotide_code - 1]
        end_seq = "".join(end_seq)

        obs_seq_mutation = ObservedSequenceMutations(start_seq, end_seq,
                                                     motif_len)

        st_time = time.time()
        feat_matrix = feat_generator.get_base_features(obs_seq_mutation)
        obs_seq_mutation.set_start_feats(feat_matrix)
        # %-formatting gives the same output as the old print statement
        print("create_base_features time %s" % (time.time() - st_time))

        # Materialise the keys so the order is a plain list on Python 3 too
        my_order = list(obs_seq_mutation.mutation_pos_dict.keys())
        seq_mut_order = ImputedSequenceMutations(
            obs_seq_mutation,
            my_order,
        )
        st_time = time.time()
        feat_generator.create_for_mutation_steps(seq_mut_order, left_update,
                                                 right_update)
        print("create_for_mutation_steps time %s" % (time.time() - st_time))
# Example #8
# 0
def get_shazam_theta(mutability_file,
                     substitution_file=None,
                     wide_format=False):
    """
    Take shazam csv files and turn them into our theta vector

    @param mutability_file: csv of mutability fit from SHazaM
    @param substitution_file: csv of substitution fit from SHazaM; when None
        only the mutability column is filled in
    @param wide_format: if True, both files are read comma-delimited and the
        mutability file holds the motifs in its first row and their values in
        its second row (first column skipped); otherwise both files are read
        space-delimited and the mutability file has a header row followed by
        one "motif value" row per motif

    @return theta: array with one row per 5-mer motif of
        MotifFeatureGenerator; column 0 is the mutability and, when a
        substitution file is given, column NUCLEOTIDE_DICT[nuc] + 1 is the
        substitution value for target nucleotide nuc
    """
    delimiter = ',' if wide_format else ' '

    # Read in the results from the shmulate model-fitter
    # Read mutability matrix
    mut_motif_dict = dict()
    with open(mutability_file, "r") as model_file:
        csv_reader = csv.reader(model_file, delimiter=delimiter)
        if wide_format:
            shazam_motif_list = next(csv_reader)[1:]
            shazam_mutabilities = next(csv_reader)[1:]
            for motif, motif_val in zip(shazam_motif_list,
                                        shazam_mutabilities):
                mut_motif_dict[motif.lower()] = motif_val
        else:
            # Skip the header row
            next(csv_reader)
            for line in csv_reader:
                mut_motif_dict[line[0].lower()] = line[1]

    num_theta_cols = 1
    sub_motif_dict = dict()
    if substitution_file is not None:
        num_theta_cols = NUM_NUCLEOTIDES + 1
        # Read substitution matrix
        with open(substitution_file, "r") as model_file:
            csv_reader = csv.reader(model_file, delimiter=delimiter)
            # Assume header is ACGT
            header = next(csv_reader)
            target_nucs = [
                header[i + 1].lower() for i in range(NUM_NUCLEOTIDES)
            ]

            for line in csv_reader:
                sub_motif_dict[line[0].lower()] = {
                    nuc: line[i + 1]
                    for i, nuc in enumerate(target_nucs)
                }

    # Shazam is always a 5mer
    feat_gen = MotifFeatureGenerator(motif_len=5)
    # Reconstruct theta in the right order
    theta = np.zeros((feat_gen.feature_vec_len, num_theta_cols))
    for motif_idx, motif in enumerate(feat_gen.motif_list):
        theta[motif_idx, 0] = read_shmulate_val(mut_motif_dict[motif])
        if num_theta_cols > 1:
            for nuc in NUCLEOTIDES:
                theta[motif_idx, NUCLEOTIDE_DICT[nuc] + 1] = read_shmulate_val(
                    sub_motif_dict[motif][nuc])

    return theta
    def __init__(self,
                 motif_lens,
                 model_truncation=None,
                 left_motif_flank_len_list=None,
                 feats_to_remove=None):
        """
        @param motif_lens: list of odd-numbered motif lengths
        @param model_truncation: ModelTruncation object
        @param left_motif_flank_len_list: list of lengths of left motif flank; 0 will mutate the leftmost position, 1 the next to left, etc.
        @param feats_to_remove: list of features to remove if a model has not been fit yet
        """

        self.model_truncation = model_truncation
        # Copy the truncation's list: extending it in place below would
        # otherwise also modify model_truncation.feats_to_remove.
        if model_truncation is not None:
            self.feats_to_remove = list(model_truncation.feats_to_remove)
        else:
            self.feats_to_remove = []
        if feats_to_remove is not None:
            self.feats_to_remove += feats_to_remove

        self.motif_lens = motif_lens

        if left_motif_flank_len_list is None:
            # default to central base mutating
            # (floor division keeps the flank length an int on Python 3)
            left_motif_flank_len_list = [[motif_len // 2]
                                         for motif_len in motif_lens]
        else:
            # make sure we're actually making a hierarchical model
            for left_motif_lens, motif_len in zip(left_motif_flank_len_list,
                                                  motif_lens):
                for left_motif_len in left_motif_lens:
                    assert (left_motif_len in range(motif_len))

        self.max_motif_len = max(motif_lens)
        # We must have motifs nested within each other for this hierarchical motif feature generator
        self.motif_len = self.max_motif_len
        self.left_motif_flank_len = get_max_mut_pos(motif_lens,
                                                    left_motif_flank_len_list)

        # Find the maximum left and right motif flank lengths to pass to MotifFeatureGenerator
        # in order to update all the relevant features
        all_left_flanks = [
            flank_len for flank_len_list in left_motif_flank_len_list
            for flank_len in flank_len_list
        ]
        all_right_flanks = [
            m - flank_len - 1
            for m, flank_len_list in zip(motif_lens, left_motif_flank_len_list)
            for flank_len in flank_len_list
        ]
        self.max_left_motif_flank_len = max(all_left_flanks)
        self.max_right_motif_flank_len = max(all_right_flanks)

        self.left_update_region = self.max_left_motif_flank_len
        self.right_update_region = self.max_right_motif_flank_len

        # Create list of feature generators for different motif lengths and different flank lengths
        self.feat_gens = []
        for motif_len, left_motif_flank_lens in zip(motif_lens,
                                                    left_motif_flank_len_list):
            for left_motif_flank_len in left_motif_flank_lens:
                self.feat_gens.append(
                    MotifFeatureGenerator(
                        motif_len=motif_len,
                        distance_to_start_of_motif=-left_motif_flank_len,
                        flank_len_offset=self.max_left_motif_flank_len -
                        left_motif_flank_len,
                    ))

        self.update_feats_after_removing(self.feats_to_remove)
    def combine_thetas_and_get_conf_int(self,
                                        theta,
                                        variance_est=None,
                                        col_idx=0,
                                        zstat=ZSCORE_95,
                                        add_targets=True):
        """
        Combine hierarchical and offset theta values

        Every motif of every sub-generator contributes its theta value to all
        full-length motifs that contain it at the generator's offset.

        @param theta: 2-d array with one row per feature of this generator
        @param variance_est: variance estimate of the fitted theta values;
            if None, no confidence interval is computed
        @param col_idx: column of theta to aggregate
        @param zstat: multiplier of the standard error for the interval
        @param add_targets: if True and col_idx != 0, column 0 of theta is
            added to column col_idx before aggregating

        @return full_theta, theta_lower, theta_upper: arrays with one entry
            per full-length motif; the bounds are all zeros when variance_est
            is None
        @raise ValueError: if any aggregated variance is negative
        """
        full_feat_generator = MotifFeatureGenerator(
            motif_len=self.motif_len,
            distance_to_start_of_motif=-self.max_left_motif_flank_len,
        )
        full_theta_size = full_feat_generator.feature_vec_len
        if self.model_truncation is not None:
            zero_theta_mask = self.model_truncation.zero_theta_mask_refit
        else:
            zero_theta_mask = np.ones(theta.shape, dtype=bool)
        assert theta.shape[0] == self.feature_vec_len
        possible_theta_mask = self.get_possible_motifs_to_targets(
            zero_theta_mask.shape)
        theta_idx_counter = create_theta_idx_mask(zero_theta_mask,
                                                  possible_theta_mask)
        # stores which hierarchical theta values were used to construct the full theta
        # important for calculating covariance
        theta_index_matches = {i: [] for i in range(full_theta_size)}

        full_theta = np.zeros(full_theta_size)
        theta_lower = np.zeros(full_theta_size)
        theta_upper = np.zeros(full_theta_size)

        for i, feat_gen in enumerate(self.feat_gens):
            is_full_length = (
                feat_gen.motif_len == full_feat_generator.motif_len)
            if is_full_length:
                assert (full_feat_generator.distance_to_start_of_motif ==
                        feat_gen.distance_to_start_of_motif)
                assert (self.max_left_motif_flank_len ==
                        -feat_gen.distance_to_start_of_motif)
            num_flank_positions = (full_feat_generator.motif_len -
                                   feat_gen.motif_len)
            flank_offset = feat_gen.flank_len_offset

            for m_idx, m in enumerate(feat_gen.motif_list):
                raw_theta_idx = self.feat_offsets[i] + m_idx

                if col_idx != 0 and add_targets:
                    m_theta = theta[raw_theta_idx, 0] + theta[raw_theta_idx,
                                                              col_idx]
                else:
                    m_theta = theta[raw_theta_idx, col_idx]

                # Indices (into the variance estimate) of the theta entries
                # behind this motif; -1 marks an entry without an index.
                # NOTE(review): the column-0 index is recorded even when
                # add_targets is False and col_idx != 0 -- confirm intended.
                matched_indices = []
                if theta_idx_counter[raw_theta_idx, 0] != -1:
                    matched_indices.append(theta_idx_counter[raw_theta_idx, 0])
                if col_idx != 0 and theta_idx_counter[raw_theta_idx,
                                                      col_idx] != -1:
                    matched_indices.append(
                        theta_idx_counter[raw_theta_idx, col_idx])

                if is_full_length:
                    # Already at maximum motif length, so nothing to combine
                    full_motifs = [m]
                else:
                    # Combine hierarchical feat_gens for given left_motif_len:
                    # pad the motif with every possible flank combination
                    full_motifs = [
                        "".join(f[:flank_offset]) + m +
                        "".join(f[flank_offset:])
                        for f in itertools.product(NUCLEOTIDE_SET,
                                                   repeat=num_flank_positions)
                    ]

                for full_m in full_motifs:
                    full_m_idx = full_feat_generator.motif_dict[full_m]
                    full_theta[full_m_idx] += m_theta
                    theta_index_matches[full_m_idx].extend(matched_indices)

        if variance_est is not None:
            # Make the aggregation matrix
            agg_matrix = np.zeros(
                (full_theta.size, np.max(theta_idx_counter) + 1))
            for full_theta_idx, matches in theta_index_matches.items():
                agg_matrix[full_theta_idx, matches] = 1

            # Covariance of the aggregated theta values
            cov_mat_full = np.dot(np.dot(agg_matrix, variance_est),
                                  agg_matrix.T)
            full_variances = np.diag(cov_mat_full)
            if np.any(full_variances < 0):
                raise ValueError(
                    "Some variance estimates were negative: %d neg var, %s" %
                    (np.sum(full_variances < 0), full_variances))

            full_std_err = np.sqrt(full_variances)
            theta_lower = full_theta - zstat * full_std_err
            theta_upper = full_theta + zstat * full_std_err

        return full_theta, theta_lower, theta_upper
    def test_update(self):
        """
        Compare the fast feature-delta computation
        (get_shuffled_mutation_steps_delta) against the slow full
        recomputation (create_for_mutation_steps) after swapping two adjacent
        steps of the mutation order.
        """
        motif_len = 3
        left_update = 1
        right_update = 1
        feat_generator = MotifFeatureGenerator(motif_len=motif_len)
        obs_seq_mut = ObservedSequenceMutations(
            start_seq="aattatgaatgc",
            end_seq="atgcaagatagc",
            motif_len=motif_len,
        )
        feat_matrix = feat_generator.get_base_features(obs_seq_mut)
        obs_seq_mut.set_start_feats(feat_matrix)

        # Compare update to create feature vectors by changing the mutation order by one step
        # Shuffle last two positions
        # (sorted() yields a real list that can be sliced on Python 3, and
        # pins the ascending order that the expected values below assume)
        new_order = sorted(obs_seq_mut.mutation_pos_dict.keys())
        new_order = new_order[0:-2] + [new_order[-1], new_order[-2]]
        ordered_seq_mut1 = ImputedSequenceMutations(obs_seq_mut, new_order)
        # Revert the sequence back two steps
        intermediate_seq = obs_seq_mut.end_seq
        intermediate_seq = (intermediate_seq[:new_order[-2]] +
                            obs_seq_mut.start_seq[new_order[-2]] +
                            intermediate_seq[new_order[-2] + 1:])
        intermediate_seq = (intermediate_seq[:new_order[-1]] +
                            obs_seq_mut.start_seq[new_order[-1]] +
                            intermediate_seq[new_order[-1] + 1:])
        flanked_seq = (obs_seq_mut.left_flank + intermediate_seq +
                       obs_seq_mut.right_flank)
        # create features - the slow version
        feat_mut_steps1 = feat_generator.create_for_mutation_steps(
            ordered_seq_mut1, left_update, right_update)
        # get the feature delta - the fast version
        first_mutation_feat, second_mut_step = feat_generator.get_shuffled_mutation_steps_delta(
            ordered_seq_mut1,
            update_step=obs_seq_mut.num_mutations - 2,
            flanked_seq=flanked_seq,
            already_mutated_pos=set(new_order[:obs_seq_mut.num_mutations - 2]),
            left_update_region=left_update,
            right_update_region=right_update,
        )
        self.assertEqual(first_mutation_feat, 14)
        self.assertEqual(feat_mut_steps1[-2].mutating_pos_feats, 14)
        self.assertEqual(second_mut_step.mutating_pos_feats, 0)
        self.assertEqual(feat_mut_steps1[-1].mutating_pos_feats, 0)

        # Compare update to create feature vectors by changing the mutation order by another step
        # Shuffle second to last with the third to last mutation positions
        # (floor division: a slice index must be an int on Python 3)
        reverted_idx = motif_len // 2 + new_order[-3]
        flanked_seq = (flanked_seq[:reverted_idx] +
                       obs_seq_mut.start_seq[new_order[-3]] +
                       flanked_seq[reverted_idx + 1:])
        new_order = new_order[0:-3] + [
            new_order[-2], new_order[-3], new_order[-1]
        ]
        ordered_seq_mut2 = ImputedSequenceMutations(obs_seq_mut, new_order)

        # create features - the slow version
        feat_mut_steps2 = feat_generator.create_for_mutation_steps(
            ordered_seq_mut2, left_update, right_update)
        # get the feature delta - the fast version
        first_mutation_feat2, second_mut_step2 = feat_generator.get_shuffled_mutation_steps_delta(
            ordered_seq_mut2,
            update_step=obs_seq_mut.num_mutations - 3,
            flanked_seq=flanked_seq,
            already_mutated_pos=set(new_order[:obs_seq_mut.num_mutations - 3]),
            left_update_region=left_update,
            right_update_region=right_update,
        )
        self.assertEqual(first_mutation_feat2, 14)
        self.assertEqual(second_mut_step2.mutating_pos_feats, 14)
        self.assertEqual(second_mut_step2.neighbors_feat_old, {9: 57, 7: 3})
        self.assertEqual(second_mut_step2.neighbors_feat_new, {9: 9, 7: 0})
        self.assertEqual(second_mut_step2.neighbors_feat_old,
                         feat_mut_steps2[-2].neighbors_feat_old)
        self.assertEqual(second_mut_step2.neighbors_feat_new,
                         feat_mut_steps2[-2].neighbors_feat_new)
# Example #12
# 0
def main(args=sys.argv[1:]):
    """
    Fit a shmulate model through the R script and pickle its parameters.

    Runs R/fit_shmulate_model.R on the input files, reads back the
    "_target.csv", "_mut.csv" and "_sub.csv" files named after the model
    pickle, arranges their values in MotifFeatureGenerator's 5-mer motif
    order and dumps (mut_model_array, (target_model_array, sub_model_array))
    to args.model_pkl.

    @param args: command line arguments
        NOTE(review): this value is immediately replaced by parse_args()
        below -- confirm whether it was meant to be passed through.
    @raise subprocess.CalledProcessError: if Rscript exits with a non-zero
        status
    """
    MOTIF_LEN = 5

    args = parse_args()
    log.basicConfig(format="%(message)s",
                    filename=args.log_file,
                    level=log.DEBUG)

    # Call Rscript
    command = 'Rscript'
    script_file = 'R/fit_shmulate_model.R'

    cmd = [
        command, script_file, args.input_file, args.input_genes,
        args.model_pkl.replace(".pkl", "")
    ]
    print("Calling: %s" % " ".join(cmd))
    # Stop on a failed fit instead of going on to read missing or stale
    # result files from an earlier run.
    subprocess.check_call(cmd)

    # Read in the results from the shmulate model-fitter
    feat_gen = MotifFeatureGenerator(motif_len=MOTIF_LEN)
    motif_list = feat_gen.motif_list
    # Read target matrix
    target_motif_dict = _read_nucleotide_table(
        args.model_pkl.replace(".pkl", "_target.csv"))

    # Read mutability matrix
    mut_motif_dict = dict()
    with open(args.model_pkl.replace(".pkl", "_mut.csv"), "r") as model_file:
        csv_reader = csv.reader(model_file)
        # First row holds the motifs, second row their values; the first
        # column of each row is skipped
        motifs = next(csv_reader)[1:]
        motif_vals = next(csv_reader)[1:]
        for motif, motif_val in zip(motifs, motif_vals):
            mut_motif_dict[motif.lower()] = motif_val

    # Read substitution matrix
    sub_motif_dict = _read_nucleotide_table(
        args.model_pkl.replace(".pkl", "_sub.csv"))

    # Reconstruct theta in the right order
    # TODO: How do we compare the edge motifs?? What does shmulate even do with them?
    target_model_array = np.zeros((feat_gen.feature_vec_len, NUM_NUCLEOTIDES))
    mut_model_array = np.zeros((feat_gen.feature_vec_len, 1))
    sub_model_array = np.zeros((feat_gen.feature_vec_len, NUM_NUCLEOTIDES))
    for motif_idx, motif in enumerate(motif_list):
        mut_model_array[motif_idx] = read_shmulate_val(mut_motif_dict[motif])
        log.info("%s:%f" % (motif, mut_model_array[motif_idx, 0]))
        for nuc in NUCLEOTIDES:
            nuc_idx = NUCLEOTIDE_DICT[nuc]
            target_model_array[motif_idx, nuc_idx] = read_shmulate_val(
                target_motif_dict[motif][nuc])
            sub_model_array[motif_idx, nuc_idx] = read_shmulate_val(
                sub_motif_dict[motif][nuc])

    if args.center_median:
        mut_median = np.median(mut_model_array)
        if np.isfinite(mut_median):
            mut_model_array -= mut_median

    # keep mut_model_array in same position as mutabilities from fit_context
    # Binary mode plus a context manager: the handle is always closed and the
    # dump also works where pickle writes bytes.
    with open(args.model_pkl, 'wb') as pkl_file:
        pickle.dump((mut_model_array, (target_model_array, sub_model_array)),
                    pkl_file)


def _read_nucleotide_table(csv_path):
    """
    Read a shmulate csv whose header row names the target nucleotides and
    whose remaining rows are "motif, value per nucleotide".

    @param csv_path: path of the csv file
    @return dict mapping lower-cased motif to a dict from lower-cased header
        nucleotide to the raw (string) cell value
    """
    table = dict()
    with open(csv_path, "r") as model_file:
        csv_reader = csv.reader(model_file)
        # Assume header is ACGT
        header = next(csv_reader)
        target_nucs = [header[i + 1].lower() for i in range(NUM_NUCLEOTIDES)]

        for line in csv_reader:
            table[line[0].lower()] = {
                nuc: line[i + 1]
                for i, nuc in enumerate(target_nucs)
            }
    return table
# Example #13
# 0
def process_model_json(fname):
    """
    Turn a json model description into feature generators and the settings
    needed alongside them.

    @param fname: file name of the json file, holding a list of model
        entries, e.g.:
        [
            {
                'feature_type': 'motif',
                'motif_length': '3',
                'distances_from_motif_start': '-2,-1,0',
                'motifs_to_keep': '',
            },
            {
                'feature_type': 'motif',
                'motif_length': '5',
                'distances_from_motif_start': '-2',
                'motifs_to_keep': '',
            },
            {
                'feature_type': 'position',
                'breaks': '0,78,114,165,195,312,348',
                'labels': 'fwr,cdr,fwr,cdr,fwr,cdr',
            },
        ]

    @return feat_gens: list of feature generators to be used in CombinedFeatureGenerator
    @return feats_to_remove: list of feature labels to remove before fitting; also passed into CombinedFeatureGenerator
    @return flanks: dictionary of left_flank_len, right_flank_len and max_motif_len for data processing
    """
    feat_gens = []
    feats_to_remove = []
    flanks = {}
    with open(fname, 'r') as json_file:
        # Pass 1: let process_individual_model work on every entry in place,
        # then derive the flank sizes from all of them together.
        specs = []
        for spec in json.load(json_file):
            process_individual_model(spec)
            specs.append(spec)

        smallest_left = min(left for spec in specs for left in spec['lefts'])
        flanks['left_flank_len'] = -min(smallest_left, 0)
        largest_right = max(
            right for spec in specs for right in spec['rights'])
        flanks['right_flank_len'] = max(largest_right, 0)
        flanks['max_motif_len'] = max(spec['motif_length'] for spec in specs)

        # Pass 2: one generator per motif offset, one per position entry
        for spec in specs:
            if spec['feature_type'] == 'motif':
                for start_distance in spec['distances_from_motif_start']:
                    motif_gen = MotifFeatureGenerator(
                        motif_len=spec['motif_length'],
                        distance_to_start_of_motif=start_distance,
                        flank_len_offset=flanks['left_flank_len'] +
                        start_distance,
                    )
                    feat_gens.append(motif_gen)

                    motifs_to_keep = spec['motifs_to_keep']
                    if not motifs_to_keep:
                        continue
                    # Everything not explicitly kept gets removed
                    feats_to_remove.extend(
                        feat_tuple
                        for feat_tuple in motif_gen.feature_info_list
                        if feat_tuple[0] not in motifs_to_keep)
            elif spec['feature_type'] == 'position':
                position_gen = PositionFeatureGenerator(
                    breaks=spec['breaks'],
                    labels=spec['labels'],
                )
                feat_gens.append(position_gen)

    return feat_gens, feats_to_remove, flanks