Example #1
0
    def __initializeTrainData(self, frac_positives):
        """Assemble the shuffled example arrays from chromosomes 1-22.

        Every chromosome contributes the same number of indel (positive)
        positions, sampled without replacement from its location file(s), and
        the same number of negative positions. Negatives are drawn at a random
        non-zero offset of at most ``self.nearby`` from a sampled indel when
        ``self.nearby`` is truthy, otherwise from the rows of the reference
        that are not all zero. For every position the ``2 * k + 1`` reference
        rows centred on it are stored (plus the coverage window when
        ``self.load_coverage`` is set). All per-example arrays are shuffled in
        unison and stored on ``self``.

        Args:
            frac_positives: desired fraction of positive examples. The number
                of negatives is ``(1 / frac_positives - 1)`` times the number
                of positives, rounded down to a multiple of 22.
        """
        k = self.window  # for brevity
        num_chrom = 22  # chromosomes 1..22 each fill an equally sized slice
        # The chromosome-21 file only determines the overall number of
        # positives; self.indelLocations is replaced inside the loop below.
        self.indelLocations = np.loadtxt(data_dir +
                                         "indelLocations21.txt").astype(int)
        lengthIndels = int(len(self.indelLocations) / num_chrom) * num_chrom
        num_negatives = int(
            int((1. / frac_positives - 1) * lengthIndels) /
            num_chrom) * num_chrom
        total_length = lengthIndels + num_negatives
        num_negatives_per_chrom = int(num_negatives / num_chrom)
        lengthIndels_per_chrom = int(lengthIndels / num_chrom)
        total_length_per_chrom = lengthIndels_per_chrom + num_negatives_per_chrom

        dataset = np.zeros((total_length, 2 * k + 1, 4))
        coverageDataset = np.zeros((total_length, 2 * k + 1))
        entropyDataset = np.zeros((total_length, 2 * k + 1))
        indices = np.zeros(total_length, dtype=np.uint32)
        nearby_indels = np.zeros(total_length, dtype=np.uint32)
        # np.bool_ instead of the np.bool alias, which recent NumPy removed.
        labeltype = np.uint8 if self.triclass else np.bool_
        labels = np.zeros(total_length, dtype=labeltype)
        genome_positions = np.zeros(total_length, dtype=np.uint32)

        # File-name suffix of the per-chromosome location files.
        ext = ".txt"
        if not self.include_filtered:
            ext = "_filtered" + ext

        for chromosome in range(1, num_chrom + 1):
            # First row of this chromosome's slice in the output arrays.
            base = total_length_per_chrom * (chromosome - 1)
            self.referenceChr = self.referenceChrFull[str(chromosome)]
            self.refChrLen = len(self.referenceChr)
            if self.triclass:
                self.insertionLocations = np.loadtxt(
                    data_dir + "indelLocations{}_ins".format(chromosome) +
                    ext).astype(int)
                self.deletionLocations = np.loadtxt(
                    data_dir + "indelLocations{}_del".format(chromosome) +
                    ext).astype(int)
                self.indelLocationsFull = np.concatenate(
                    (self.insertionLocations, self.deletionLocations))
                num_insertions = int(lengthIndels_per_chrom / 2)
                # Subsample in place: the labelling below relies on
                # len(self.insertionLocations) being the number of insertion
                # rows at the front of self.indelLocations.
                self.insertionLocations = np.random.choice(
                    self.insertionLocations,
                    size=num_insertions,
                    replace=False)
                self.deletionLocations = np.random.choice(
                    self.deletionLocations,
                    size=lengthIndels_per_chrom - num_insertions,
                    replace=False)
                self.indelLocations = np.concatenate(
                    (self.insertionLocations, self.deletionLocations))
                self.indelLocations = self.indelLocations - self.offset
            else:
                self.indelLocationsFull = np.loadtxt(
                    data_dir + "indelLocations{}".format(chromosome) +
                    ext).astype(int)
                self.indelLocations = np.random.choice(
                    self.indelLocationsFull,
                    size=lengthIndels_per_chrom,
                    replace=False)
                self.indelLocations = self.indelLocations - self.offset

            # Rows of the reference with at least one non-zero entry.
            self.nonzeroLocationsRef = np.where(
                np.any(self.referenceChr != 0, axis=1))[0]
            if self.nearby:
                self.zeroLocationsRef = np.where(
                    np.all(self.referenceChr == 0, axis=1))[0]
                self.setOfZeroLocations = set(self.zeroLocationsRef)
            self.coverage = None
            if self.load_coverage:
                self.coverage = lc.load_coverage(
                    data_dir + "coverage/{}.npy".format(chromosome))
            self.setOfIndelLocations = set(self.indelLocations)
            self.prevChosenRefLocations = set()
            # For a positive example the "nearby" indel is the indel itself.
            nearby_indels[base:base +
                          lengthIndels_per_chrom] = self.indelLocations

            # Tentative negative positions; each one is re-drawn in the loop
            # below if it turns out to be unusable.
            if self.nearby:
                neg_positions = np.random.choice(self.indelLocations,
                                                 size=num_negatives_per_chrom)
                nearby_indels[base + lengthIndels_per_chrom:base +
                              total_length_per_chrom] = neg_positions
                offset = np.multiply(
                    np.random.randint(1,
                                      self.nearby + 1,
                                      size=num_negatives_per_chrom),
                    np.random.choice([-1, 1], size=num_negatives_per_chrom))
                # locations that are offset from indels by some amount
                neg_positions = neg_positions + offset
            else:
                neg_positions = np.random.choice(self.nonzeroLocationsRef,
                                                 size=num_negatives_per_chrom)
                self.nearby_indels = neg_positions  # to prevent error if this is undefined

            for i in range(total_length_per_chrom):
                row = base + i
                if i < lengthIndels_per_chrom:
                    if not self.triclass:
                        label = 1  # standard binary classification labels
                    elif i < len(self.insertionLocations):
                        label = 1  # insertions will be labeled as 1
                    else:
                        label = 2  # deletions will be labeled as 2
                    pos = self.indelLocations[i]
                else:
                    label = 0
                    pos = neg_positions[i - lengthIndels_per_chrom]
                    if self.nearby:
                        # Re-draw while the position was already used, is an
                        # all-zero reference row, or is a sampled indel. The
                        # iteration cap applies to the whole condition so the
                        # loop always terminates; after the cap the last
                        # candidate is kept even if it is still unsuitable.
                        niter = 0
                        while niter < 1001 and (
                                pos in self.prevChosenRefLocations
                                or pos in self.setOfZeroLocations
                                or pos in self.setOfIndelLocations):
                            nearby_indels[row] = np.random.choice(
                                self.indelLocations)
                            pos = nearby_indels[row] + np.random.randint(
                                1, self.nearby + 1) * np.random.choice([-1, 1])
                            niter += 1
                    else:
                        while (pos in self.prevChosenRefLocations) or (
                                pos in self.setOfIndelLocations):
                            pos = np.random.choice(self.nonzeroLocationsRef)
                    self.prevChosenRefLocations.add(pos)
                indices[row] = pos
                # get k base pairs before and after the position
                window = self.referenceChr[pos - k:pos + k + 1]
                # Stays None when no coverage track was loaded.
                coverageWindow = None
                if self.coverage is not None:
                    coverageWindow = utils.flatten(self.coverage[pos - k:pos +
                                                                 k + 1])
                dataset[row] = window
                coverageDataset[row] = coverageWindow
                labels[row] = label
                genome_positions[row] = pos

        if self.load_entropy:
            # Only the last k columns of each row are filled.
            entropyDataset[:, k + 1:2 * k + 1] = entropy.entropyVector(dataset)

        # Shuffle all per-example arrays in unison. The zip is materialised
        # because np.random.shuffle needs a sequence and zip() is lazy on
        # Python 3.
        rawZipped = list(
            zip(list(dataset), list(coverageDataset), list(labels),
                list(genome_positions), list(indices), list(nearby_indels),
                list(entropyDataset)))
        np.random.shuffle(rawZipped)
        a, b, c, d, e, f, g = zip(*rawZipped)

        self.dataset = np.array(a)
        self.coverageDataset = np.array(b)
        self.entropyDataset = np.array(g)
        labels = np.array(c, dtype=labeltype)
        self.genome_positions = np.array(d, dtype=np.uint32)
        self.indices = np.array(e, dtype=np.uint32)
        self.nearby_indels = np.array(f, dtype=np.uint32)
        if self.triclass:
            self.labels = utils.to_onehot(labels, 3)
        else:
            self.labels = np.expand_dims(labels, axis=1)
        # The first num_train_examples shuffled rows form the training set.
        self.num_train_examples = int(
            round(total_length * (1 - self.test_frac)))
        self.ordering = list(range(0, self.num_train_examples))
Example #2
0
  def __initializeTrainData(self, frac_positives):
    """Build the per-chromosome sampled dataset from chromosomes 2-22.

    For each chromosome a fixed number of indel positions is sampled from the
    filtered indel file, keeping only rows whose filter column is 1, whose
    allele count is 1 and whose sequence complexity is at least
    ``self.complexity_threshold``. Negatives are sampled from the pre-sampled
    non-indel file after the same complexity filter; when ``self.nearby`` is
    truthy they are preferentially taken within ``self.nearby`` positions of a
    sampled indel. For every position the ``2*k + 1`` reference rows centred
    on it are stored. Two randomly chosen chromosomes are held out as the
    validation and test sets.

    Args:
      frac_positives: desired fraction of positive examples; the number of
        negatives is ``(1/frac_positives - 1)`` times the number of positives,
        rounded down to a multiple of the number of chromosomes used.
    """
    ##
    # for brevity
    k = self.window
    # The window size used to compute sequence complexity
    k_seq_complexity = 20
    # We use chromosomes 2-22, we won't use chromosome 1 until the very end
    num_chrom_used = 21
    ##
    # Number of indels in the entire dataset used to train/test/val
    lengthIndels = 25000*num_chrom_used
    # Number of non-indels in the entire dataset
    num_negatives = int(int((1./frac_positives-1) * lengthIndels)/num_chrom_used)*num_chrom_used
    # Number of locations in the entire dataset
    total_length = lengthIndels + num_negatives
    ##
    # Number of non-indels in the entire dataset per chromosome
    num_negatives_per_chrom = int(num_negatives/num_chrom_used)
    # Number of indels in the entire dataset per chromosome
    lengthIndels_per_chrom = int(lengthIndels/num_chrom_used)
    # Number of locations in the entire dataset per chromosome
    total_length_per_chrom = lengthIndels_per_chrom + num_negatives_per_chrom
    ##
    # one-hot encoded sequences of size 2*k + 1 around each location
    dataset = np.zeros((total_length, 2*k + 1, 4))
    # coverage corresponding to each location in the dataset
    coverageDataset = np.zeros((total_length, 2*k + 1))
    # entropy of expanding windows in the dataset
    entropyDataset = np.zeros((total_length, 2*k + 1))
    # indices on the genome of the locations in the dataset
    indices = np.zeros(total_length, dtype=np.uint32)
    # allele count values for indels, 0 for non-indels
    allele_count = np.zeros(total_length, dtype=np.uint32)
    # never filled in this method (see the TODO in the loop below)
    nearby_indels = np.zeros(total_length, dtype=np.uint32)
    # label is either a bool or an int depending on the number of classes
    # (np.bool_ instead of the np.bool alias, which recent NumPy removed)
    if self.triclass:
      labeltype = np.uint8
    else:
      labeltype = np.bool_
    # 0 for non-indels 1 (and 2) in case of indels
    labels = np.zeros(total_length, dtype=labeltype)
    # seems to be the same as indices, TODO: does it need to be there?
    genome_positions = np.zeros(total_length, dtype=np.uint32)
    # the chromosome number corresponding to each location
    chrom_num = np.zeros(total_length, dtype=np.uint32)
    # Count the indels inside non-indel windows, as well as the indels inside indel windows
    num_indel_neg_set = 0
    num_indel_pos_set = 0

    # Load data from chromosomes 2-22
    # populate dataset and related variables per chromosome
    for chromosome in range(2, 23):
      # First row of this chromosome's slice in the output arrays
      base = total_length_per_chrom*(chromosome - 2)
      ##
      # Load the chromosome from the full genome
      referenceChr = self.referenceChrFull[str(chromosome)]
      ## Load and process the positive (indels) dataset
      # This is a 4 column data: indel locations, allele count, filter value, insertion (1) or deletion (0)
      indel_data_load = np.load(data_dir + "indelLocationsFiltered" + str(chromosome) + ".npy")
      # Every indel location in the file, before any filtering or sampling
      indel_indices_set = set(np.array(indel_data_load[:, 0], dtype = int))
      # Drop indels whose window would run past the end of the reference
      indel_data_load = indel_data_load[indel_data_load[:, 0] + k < referenceChr.shape[0]]
      # Build the (2*k_seq_complexity + 1) x num_indels grid of positions around each indel
      indel_sequence_indices = np.arange(2*k_seq_complexity + 1) - k_seq_complexity
      indel_sequence_indices = np.repeat(indel_sequence_indices, indel_data_load.shape[0], axis = 0)
      indel_sequence_indices = np.reshape(indel_sequence_indices, [-1, indel_data_load.shape[0]])
      indel_sequence_indices += np.transpose(np.array(indel_data_load[:, 0], dtype = int))
      indel_sequence_complexity = entropy.entropySequence(referenceChr[indel_sequence_indices.transpose(), :])
      del indel_sequence_indices
      # Filter by filter value and by sequence complexity against the complexity threshold
      total_indices = np.arange(indel_data_load.shape[0])
      filtered_indices = np.logical_and(indel_data_load[:, 2] == 1, indel_sequence_complexity >= self.complexity_threshold)
      # Add an additional filter for allele count = 1
      filtered_indices = np.logical_and(indel_data_load[:, 1] == 1, filtered_indices)

      # Sample the indels, taking into consideration the classification problem in hand
      if self.triclass:
        # indel_data_load is indexed as a plain array everywhere else, so
        # column 3 is read with array slicing as well
        filtered_indices_insert = np.logical_and(indel_data_load[:, 3] == 1, filtered_indices)
        filtered_indices_insert = total_indices[filtered_indices_insert]
        filtered_indices_delete = np.logical_and(indel_data_load[:, 3] == 0, filtered_indices)
        filtered_indices_delete = total_indices[filtered_indices_delete]
        # Insertions fill the first half of the positives, deletions the rest
        insertionLocations = np.random.choice(filtered_indices_insert, size = int(lengthIndels_per_chrom/2), replace = False)
        deletionLocations = np.random.choice(filtered_indices_delete, size = lengthIndels_per_chrom - int(lengthIndels_per_chrom/2), replace = False)
        indel_indices = np.concatenate((insertionLocations, deletionLocations))
        del filtered_indices_insert, filtered_indices_delete, insertionLocations, deletionLocations
      else:
        filtered_indices = total_indices[filtered_indices]
        indel_indices = np.random.choice(filtered_indices, size = lengthIndels_per_chrom, replace = False)
      ##
      indelLocations = np.array(indel_data_load[indel_indices, 0], dtype = int)
      allele_count_val = indel_data_load[indel_indices, 1]
      del indel_data_load, indel_indices, filtered_indices, total_indices
      indelLocations = indelLocations - self.offset

      ## Load the coverage data if needed
      coverage = None
      if self.load_coverage:
        coverage = lc.load_coverage(data_dir + "coverage/{}.npy".format(chromosome))

      ## Create the negative dataset
      # Oversample so that enough candidates survive the complexity filter
      rel_size_neg_large = 2
      neg_positions_large = np.load(data_dir + "nonindelLocationsSampled" + str(chromosome) + '.npy')
      neg_positions_large = np.random.choice(neg_positions_large, size = rel_size_neg_large*num_negatives_per_chrom, replace = False)
      # Remove those that have complexity below the threshold
      neg_sequence_indices = np.arange(2*k_seq_complexity + 1) - k_seq_complexity
      neg_sequence_indices = np.repeat(neg_sequence_indices, len(neg_positions_large), axis = 0)
      neg_sequence_indices = np.reshape(neg_sequence_indices, [-1, len(neg_positions_large)])
      neg_sequence_indices += np.transpose(neg_positions_large)
      neg_sequence_complexity = entropy.entropySequence(referenceChr[neg_sequence_indices.transpose(), :])
      neg_positions_large = neg_positions_large[neg_sequence_complexity >= self.complexity_threshold]
      del neg_sequence_indices, neg_sequence_complexity
      ##
      if self.nearby:
        # Create a list of all permissible nearby locations
        nearby_locations = np.arange(-self.nearby, self.nearby + 1)
        nearby_locations = np.repeat(nearby_locations, len(indelLocations), axis = 0)
        nearby_locations = np.reshape(nearby_locations, [-1, len(indelLocations)])
        nearby_locations += np.transpose(indelLocations)
        nearby_locations = np.reshape(nearby_locations, -1)
        # Remove all indel locations and low-complexity non-indel locations from nearby locations.
        # NOTE(review): indel_indices_set (every location in the indel file) is
        # used as the set of indel locations to exclude -- confirm that this is
        # the intended "full" set, in particular when self.offset is non-zero.
        large_neg_set = set(neg_positions_large)
        nearby_set = (set(nearby_locations) - indel_indices_set) & large_neg_set
        # Sorted so the sampling below does not depend on set iteration order
        nearby_locations = np.array(sorted(nearby_set), dtype = int)
        if len(nearby_locations) >= num_negatives_per_chrom:
          neg_positions = np.random.choice(nearby_locations, size = num_negatives_per_chrom, replace = False)
        else:
          # Else sample the remaining from the negative positions- this is the best that can be done, try increasing the nearby size
          print("Try increasing nearby or rel_size_neg_large. Not enough nearby-non-indels could be sampled in chromosome {}".format(chromosome))
          num_neg_needed = num_negatives_per_chrom - len(nearby_locations)
          remaining = np.array(sorted((large_neg_set - indel_indices_set) - nearby_set), dtype = int)
          not_nearby = np.random.choice(remaining, size = num_neg_needed, replace = False)
          neg_positions = np.concatenate((nearby_locations, not_nearby))
      else:
        neg_positions = np.random.choice(neg_positions_large, size = num_negatives_per_chrom, replace = False)

      for i in range(lengthIndels_per_chrom + num_negatives_per_chrom):
        row = base + i
        if i < lengthIndels_per_chrom:
          if not self.triclass:
            label = 1 # standard binary classification labels
          elif i < int(lengthIndels_per_chrom/2):
            label = 1 # insertions will be labeled as 1
          else:
            label = 2 # deletions will be labeled as 2
          pos = indelLocations[i]
          allele_count[row] = allele_count_val[i]
          num_indel_pos_set += len(indel_indices_set & set(range(pos - k, pos + k + 1)))
        else:
          label = 0
          pos = neg_positions[i - lengthIndels_per_chrom]
          # TODO: compute the true value of nearby_indels when self.nearby is set
          num_indel_neg_set += len(indel_indices_set & set(range(pos - k, pos + k + 1)))
        indices[row] = pos
        coverageWindow = np.zeros(2*k + 1)
        # get k base pairs before and after the position
        window = referenceChr[pos - k : pos + k + 1]
        if coverage is not None:
          # Every entry of the coverage row holds the mean coverage over the window
          coverageWindow += np.mean(utils.flatten(coverage[pos - k : pos + k + 1]))
        dataset[row] = window
        coverageDataset[row] = coverageWindow
        labels[row] = label
        genome_positions[row] = pos
        chrom_num[row] = chromosome
    if self.load_entropy:
      # Only the last k columns of each row are filled
      entropyDataset[:, k+1:2*k+1] = entropy.entropyVector(dataset)
    ##
    # Randomly choose the validation and test chromosome
    self.val_chrom, self.test_chrom = np.random.choice(range(2, 23), 2, replace=False)
    # Set the number of training examples, and the respective set indices
    total_indices = np.arange(total_length)
    self.num_train_examples = total_length_per_chrom*(num_chrom_used - 2)
    self.train_indices = total_indices[np.logical_and(chrom_num != self.val_chrom, chrom_num != self.test_chrom)]
    self.test_indices = total_indices[chrom_num == self.test_chrom]
    self.val_indices = total_indices[chrom_num == self.val_chrom]
    ##
    # Set the respective variables
    self.dataset = dataset
    self.coverageDataset = coverageDataset
    self.entropyDataset = entropyDataset
    self.indices = indices
    self.allele_count = allele_count
    self.nearby_indels = nearby_indels
    self.genome_positions = genome_positions
    if self.triclass:
      self.labels = utils.to_onehot(labels, 3)
    else:
      self.labels = np.expand_dims(labels, axis=1) # Make labels n by 1 (for convenience)
    # Diagnostics: indel counts inside positive/negative windows (total and
    # per example), then mean and mean variance of the coverage rows
    print(num_indel_pos_set)
    print(float(num_indel_pos_set)/lengthIndels)
    print(num_indel_neg_set)
    print(float(num_indel_neg_set)/num_negatives)
    print(np.mean(np.mean(self.coverageDataset, axis = 1)))
    print(np.mean(np.var(self.coverageDataset, axis = 1)))
Example #3
0
  def __initializeTrainData(self, frac_positives):
    """Build the shuffled positive and negative example arrays.

    Positive examples are all entries of ``self.indelLocations``. Negative
    examples are drawn at a random non-zero offset of at most ``self.nearby``
    from a known indel when ``self.nearby`` is truthy, otherwise from
    ``self.nonzeroLocationsRef``. For every example the ``2*k + 1`` rows of
    ``self.referenceChr`` centred on its position are stored, together with
    the coverage window and the mean recombination value over the window when
    those tracks are available. All per-example arrays are shuffled in unison
    and stored on ``self``.

    Args:
      frac_positives: desired fraction of positive examples; the number of
        negatives is ``int((1/frac_positives - 1) * number of indels)``.
    """
    k = self.window # for brevity
    lengthIndels = len(self.indelLocations) # Total number of indels
    # Total number of negative examples needed for the desired fraction of positives
    num_negatives = int((1./frac_positives-1) * lengthIndels)
    total_length = lengthIndels + num_negatives # Total number of examples [both training and testing!]
    dataset = np.zeros((total_length, 2*k + 1, 4))
    coverageDataset = np.zeros((total_length, 2*k + 1))
    entropyDataset = np.zeros((total_length, 2*k + 1))
    recombinationDataset = np.zeros((total_length, 1))
    # Three distinct labels in the triclass case, booleans otherwise
    # (np.bool_ instead of the np.bool alias, which recent NumPy removed)
    labeltype = np.uint8 if self.triclass else np.bool_
    labels = np.zeros(total_length, dtype=labeltype)
    genome_positions = np.zeros(total_length, dtype=np.uint32)

    # dataset should have all the indels as well as random negative training samples
    if self.nearby:
      # First choose random known indels, with repetition allowed
      neg_positions = np.random.choice(self.indelLocations, size=num_negatives)
      self.nearby_indels = neg_positions # Store the locations of these selected indels
      # Offset each by a random nonzero amount of magnitude <= self.nearby
      offset = np.multiply(np.random.randint(1, self.nearby+1, size=num_negatives), np.random.choice([-1, 1], size=num_negatives))
      # Tentative negative positions; individual entries may be re-drawn in the loop below
      neg_positions = neg_positions + offset
    else:
      # Select random locations from the non-zero locations of the reference
      neg_positions = np.random.choice(self.nonzeroLocationsRef, size=num_negatives)
      self.nearby_indels = neg_positions # to prevent error if this is undefined (value is meaningless in this case)
    self.indices = neg_positions # Locations of the negative examples, updated in the loop

    for i in range(total_length): # Loop over all examples
      if i < lengthIndels: # Positive example
        if not self.triclass:
          label = 1 # standard binary classification labels
        elif i < len(self.insertionLocations):
          label = 1 # insertions will be labeled as 1
        else:
          label = 2 # deletions will be labeled as 2
        pos = self.indelLocations[i]
      else: # Negative example (not an indel)
        neg_i = i - lengthIndels
        label = 0
        pos = neg_positions[neg_i] # Tentative position; replaced below if it is unusable
        if self.nearby: # Position must be near a known indel
          # Re-draw while the position was already selected, is a zero
          # location of the reference, or is an actual indel. The iteration
          # cap applies to the whole condition so the loop always terminates;
          # after the cap the last candidate is kept even if still unsuitable.
          niter = 0
          while niter < 1001 and ((pos in self.prevChosenRefLocations) or (pos in self.setOfZeroLocations) or (pos in self.setOfIndelLocations)):
            self.nearby_indels[neg_i] = np.random.choice(self.indelLocations)
            pos = self.nearby_indels[neg_i] + np.random.randint(1, self.nearby+1) * np.random.choice([-1, 1])
            niter += 1
        else: # Position only has to be unused so far and not a positive (i.e. indel) example
          while (pos in self.prevChosenRefLocations) or (pos in self.setOfIndelLocations):
            pos = np.random.choice(self.nonzeroLocationsRef)
        self.indices[neg_i] = pos # True position of the negative example
        self.prevChosenRefLocations.add(pos) # Store this position, so we don't reuse it
      # get the k base pairs before and after the position, and the position itself
      window = self.referenceChr[pos - k : pos + k + 1]
      coverageWindow = None # Stays None when no coverage track is loaded
      if self.coverage is not None:
        coverageWindow = utils.flatten(self.coverage[pos - k : pos + k + 1])
      recombWindowAverage = None # Stays None when no recombination track is loaded
      if self.recombination is not None:
        recombWindow = np.zeros((2*k + 1, 1))
        recombWindowIndices = np.arange(pos - k, pos + k + 1).reshape((2*k + 1, 1))
        # Window positions inside the recombination track take its values ...
        recombInBounds = recombWindowIndices[np.where(recombWindowIndices < len(self.recombination))]
        recombWindow[recombInBounds - (pos - k)] = self.recombination[recombInBounds]
        # ... positions past its end are filled with its last value
        recombOutOfBounds = recombWindowIndices[np.where(recombWindowIndices >= len(self.recombination))]
        recombWindow[recombOutOfBounds - (pos - k)] = self.recombination[-1]
        recombWindowAverage = np.mean(recombWindow)
      dataset[i] = window # Store the data for this example in the overall data structure
      coverageDataset[i] = coverageWindow
      recombinationDataset[i] = recombWindowAverage
      labels[i] = label
      genome_positions[i] = pos # This might be the same as self.indices?

    self.indices = np.concatenate((self.indelLocations, self.indices)) # Indices for positive examples are simply in self.indelLocations
    self.nearby_indels = np.concatenate((self.indelLocations, self.nearby_indels)) # "Nearby" indel for a positive example is the indel itself
    if self.load_entropy:
      # Only the last k columns of each row are filled
      entropyDataset[:, k+1:2*k+1] = entropy.entropyVector(dataset)

    # Shuffle all per-example arrays in unison. The zip is materialised because
    # np.random.shuffle needs a sequence and zip() is lazy on Python 3.
    rawZipped = list(zip(list(dataset), list(coverageDataset), list(labels), list(genome_positions), list(self.indices), list(self.nearby_indels), list(entropyDataset), list(recombinationDataset)))
    np.random.shuffle(rawZipped)
    a, b, c, d, e, f, g, h = zip(*rawZipped)

    self.dataset = np.array(a)
    self.coverageDataset = np.array(b)
    self.entropyDataset = np.array(g)
    self.recombinationDataset = np.array(h)
    labels = np.array(c, dtype=labeltype)
    self.genome_positions = np.array(d, dtype=np.uint32)
    self.indices = np.array(e, dtype=np.uint32)
    self.nearby_indels = np.array(f, dtype=np.uint32)
    if self.triclass:
      self.labels = utils.to_onehot(labels, 3)
    else:
      self.labels = np.expand_dims(labels, axis=1) # Make labels n by 1 (for convenience)
    self.num_train_examples = int(round(total_length * (1-self.test_frac))) # Number of examples to use for training (as opposed to testing)
    self.ordering = list(range(0, self.num_train_examples)) # Order in which we go through the training examples (will be changed)
Example #4
0
    print_every = 100  # print accuracy every 100 steps


# Build the dataset once through the per-chromosome sampling loader.
config = Config()
loader = load_full_dataset_sample_per_chrom.DatasetLoader(
    windowSize=config.window,
    batchSize=config.batch_size,
    testBatchSize=config.test_batch_size,
    seed=1,
    load_coverage=False,
    complexity_threshold=1.2,
    pos_frac=0.5)

# Features for the two baselines: entropy vectors and 2-mer frequencies.
datset = loader.dataset
labls = utils.flatten(loader.labels)
entropyMatrix = entropy.entropyVector(datset)
freq_count, freq_matrix = sequence_analysis.sequence_2_mer_generate(datset)
print("Validation Chromosome: {}".format(loader.val_chrom))
print("Test Chromosome: {}".format(loader.test_chrom))

# Logistic regression on the entropy features; the allele counts of the test
# examples are passed along as well.
print("Entropy Model")
log_reg_model_entrpy = entropy.logisticRegression(
    entropyMatrix,
    labls,
    loader.train_indices,
    loader.test_indices,
    testAC=loader.allele_count[loader.test_indices])

# Logistic regression on the 2-mer frequency features.
print("Frequency Model")
log_reg_model_freq = sequence_analysis.logistic_regression_2_mer(
    freq_matrix, labls, loader.train_indices, loader.test_indices)
'''
loader.load_chromosome_window_data(loader.val_chrom)
Example #5
0
        seq_val = seq_ch
    else:
        seq.extend(seq_ch)
    del seq_ch

# Release the reference objects; they are not used in the rest of this section.
del reference, referenceChr

# Shuffle the training data, so we can easily choose a random subset for
# testing. The same permutation is applied to the sequences and their targets.
order = list(range(len(seq)))
random.shuffle(order)
seq = np.array([seq[i] for i in order])
num_indels = np.array([num_indels[i] for i in order])

# Training inputs are the entropy vectors of the first num_train_ex sequences;
# the targets are the matching num_indels entries.
x_train = np.array(seq[:num_train_ex])
x_train = entropy.entropyVector(x_train)
y_train = np.array(num_indels[:num_train_ex])
# Test inputs and targets come from seq_val / num_indels_val.
x_test = np.array(seq_val)
x_test = entropy.entropyVector(x_test)
y_test = np.array(num_indels_val)

# Persist the test targets next to the data files.
np.save(data_dir + 'RegrKerasEntropyTestLab' + str(validation_chrom) + '.npy',
        y_test)

print('Mean # indels per window: {}'.format(
    float(sum(y_train)) / len(y_train)))

import keras
from keras.regularizers import l2
from keras.layers import Conv1D, Dense, Flatten, Dropout
Example #6
0
	def __initializeTrainData(self, frac_positives):
		"""Build the shuffled positive and negative example arrays.

		Positive examples are all entries of ``self.indelLocations``. Negative
		examples are drawn at a random non-zero offset of at most ``self.nearby``
		from a known indel when ``self.nearby`` is truthy, otherwise from
		``self.nonzeroLocationsRef``. For every example the ``2*k + 1`` rows of
		``self.referenceChr`` centred on its position are stored, together with
		the coverage window and the mean recombination value over the window
		when those tracks are available. All per-example arrays are shuffled in
		unison and stored on ``self``.

		Args:
			frac_positives: desired fraction of positive examples; the number of
				negatives is ``int((1/frac_positives - 1) * number of indels)``.
		"""
		k = self.window # for brevity
		lengthIndels = len(self.indelLocations)
		num_negatives = int((1./frac_positives-1) * lengthIndels)
		total_length = lengthIndels + num_negatives
		dataset = np.zeros((total_length, 2*k + 1, 4))
		coverageDataset = np.zeros((total_length, 2*k + 1))
		entropyDataset = np.zeros((total_length, 2*k + 1))
		recombinationDataset = np.zeros((total_length, 1))
		# np.bool_ instead of the np.bool alias, which recent NumPy removed
		labeltype = np.uint8 if self.triclass else np.bool_
		labels = np.zeros(total_length, dtype=labeltype)
		genome_positions = np.zeros(total_length, dtype=np.uint32)

		# dataset should have all the indels as well as random negative training samples
		if self.nearby:
			neg_positions = np.random.choice(self.indelLocations, size=num_negatives)
			self.nearby_indels = neg_positions
			offset = np.multiply(np.random.randint(1, self.nearby+1, size=num_negatives), np.random.choice([-1, 1], size=num_negatives))
			neg_positions = neg_positions + offset # locations that are offset from indels by some amount
			self.indices = neg_positions
		else:
			neg_positions = np.random.choice(self.nonzeroLocationsRef, size=num_negatives)
			self.indices = neg_positions
			self.nearby_indels = neg_positions # to prevent error if this is undefined

		for i in range(total_length):
			if i < lengthIndels:
				if not self.triclass:
					label = 1 # standard binary classification labels
				elif i < len(self.insertionLocations):
					label = 1 # insertions will be labeled as 1
				else:
					label = 2 # deletions will be labeled as 2
				pos = self.indelLocations[i]
			else:
				neg_i = i - lengthIndels
				label = 0
				pos = neg_positions[neg_i] # tentative position, re-drawn below if unusable
				if self.nearby:
					# Re-draw while the position was already selected, is a zero
					# location of the reference, or is an actual indel. The
					# iteration cap applies to the whole condition so the loop
					# always terminates; after the cap the last candidate is
					# kept even if it is still unsuitable.
					niter = 0
					while niter < 1001 and ((pos in self.prevChosenRefLocations) or (pos in self.setOfZeroLocations) or (pos in self.setOfIndelLocations)):
						self.nearby_indels[neg_i] = np.random.choice(self.indelLocations)
						pos = self.nearby_indels[neg_i] + np.random.randint(1, self.nearby+1) * np.random.choice([-1, 1])
						niter += 1
				else:
					while (pos in self.prevChosenRefLocations) or (pos in self.setOfIndelLocations):
						pos = np.random.choice(self.nonzeroLocationsRef)
				self.indices[neg_i] = pos
				self.prevChosenRefLocations.add(pos)
			# get k base pairs before and after the position
			window = self.referenceChr[pos - k : pos + k + 1]
			coverageWindow = None # stays None when no coverage track is loaded
			if self.coverage is not None:
				coverageWindow = utils.flatten(self.coverage[pos - k : pos + k + 1])
			recombWindowAverage = None # stays None when no recombination track is loaded
			if self.recombination is not None:
				recombWindow = np.zeros((2*k + 1, 1))
				recombWindowIndices = np.arange(pos - k, pos + k + 1).reshape((2*k + 1, 1))
				# Window positions inside the recombination track take its values ...
				recombInBounds = recombWindowIndices[np.where(recombWindowIndices < len(self.recombination))]
				recombWindow[recombInBounds - (pos - k)] = self.recombination[recombInBounds]
				# ... positions past its end are filled with its last value
				recombOutOfBounds = recombWindowIndices[np.where(recombWindowIndices >= len(self.recombination))]
				recombWindow[recombOutOfBounds - (pos - k)] = self.recombination[-1]
				recombWindowAverage = np.mean(recombWindow)
			dataset[i] = window
			coverageDataset[i] = coverageWindow
			recombinationDataset[i] = recombWindowAverage
			labels[i] = label
			genome_positions[i] = pos

		# Positive examples come first: their index and "nearby" indel are the indel itself
		self.indices = np.concatenate((self.indelLocations, self.indices))
		self.nearby_indels = np.concatenate((self.indelLocations, self.nearby_indels))
		if self.load_entropy:
			# Only the last k columns of each row are filled
			entropyDataset[:, k+1:2*k+1] = entropy.entropyVector(dataset)

		# Shuffle all per-example arrays in unison. The zip is materialised
		# because np.random.shuffle needs a sequence and zip() is lazy on
		# Python 3.
		rawZipped = list(zip(list(dataset), list(coverageDataset), list(labels), list(genome_positions), list(self.indices), list(self.nearby_indels), list(entropyDataset), list(recombinationDataset)))
		np.random.shuffle(rawZipped)
		a, b, c, d, e, f, g, h = zip(*rawZipped)

		self.dataset = np.array(a)
		self.coverageDataset = np.array(b)
		self.entropyDataset = np.array(g)
		self.recombinationDataset = np.array(h)
		labels = np.array(c, dtype=labeltype)
		self.genome_positions = np.array(d, dtype=np.uint32)
		self.indices = np.array(e, dtype=np.uint32)
		self.nearby_indels = np.array(f, dtype=np.uint32)
		if self.triclass:
			self.labels = utils.to_onehot(labels, 3)
		else:
			self.labels = np.expand_dims(labels, axis=1)
		# The first num_train_examples shuffled rows form the training set
		self.num_train_examples = int(round(total_length * (1-self.test_frac)))
		self.ordering = list(range(0, self.num_train_examples))