Ejemplo n.º 1
0
def make_pose_input(Arguments):
    '''
    Collect every input needed for (e)POSE derivation.

    Returns a 6-tuple:
    (mutations, sequences, reference gene, identities, residue burial, annotation).
    Residue burial and annotation are empty dictionaries unless
    Arguments.Structure / Arguments.Annotation are truthy.
    '''

    # Amino acid substitutions with their binary phenotypes or endophenotypes
    mutations = get_mutations(Arguments)
    # Contents of the fasta formatted sequence file
    sequences = get_sequences(Arguments)
    # The gene being scored
    reference_gene = get_reference_gene(Arguments)
    # %ID of all sequences relative to the reference gene
    identities = get_identities(reference_gene, sequences, Arguments)

    # Optional inputs: only computed when requested, otherwise left empty
    residue_burial = (
        normalized_residue_burial(Arguments) if Arguments.Structure else {}
    )
    annotation = get_annotation(Arguments) if Arguments.Annotation else {}

    return (
        mutations,
        sequences,
        reference_gene,
        identities,
        residue_burial,
        annotation,
    )
Ejemplo n.º 2
0
def leave_some_out(Arguments):
    '''
    Divide up the mutations into hold-out sets before sending off to cross validation.

    Reads Arguments.LeaveSomeOut (used to derive the number of splits) and
    Arguments.Mode ("POSE" or "ePOSE"). Builds a dictionary mapping each split
    index to the list of mutations held out in that split and hands it to
    cross_validation(). Returns None.

    Raises ValueError when Arguments.Mode is neither "POSE" nor "ePOSE"
    (this used to surface as an unbound-variable error further down).
    '''

    Mutations = get_mutations(Arguments)

    CrossValidations = int(
        ceil(len(Mutations) / float(Arguments.LeaveSomeOut)))
    Splits = range(CrossValidations)

    #For a standard POSE, the trick is keeping an as-balanced-as-possible splitting of the classes for each data split
    if Arguments.Mode == "POSE":
        Positive = [
            Mutation for Mutation, Phenotype in Mutations.items() if Phenotype
        ]
        Negative = [
            Mutation for Mutation, Phenotype in Mutations.items()
            if not Phenotype
        ]

        #Get rid of bias that MIGHT be inherent in the original order of the mutations,
        #then deal each class out as evenly as possible among the cross-validations
        shuffle(Positive)
        Positive = {Split: Positive[Split::CrossValidations] for Split in Splits}

        shuffle(Negative)
        Negative = {Split: Negative[Split::CrossValidations] for Split in Splits}

        #Only for the leave ONE out case do we do cases and controls in series.
        #Both classes fill the splits from index 0 upward, so without this the
        #first splits would hold two mutations and the last ones none. Flipping
        #the split order of one class moves it to the tail end. The earlier code
        #chose the class by comparing the two split dictionaries, which always
        #have the same number of keys, so the negatives were always the ones
        #flipped; that is kept here, with explicit indexing instead of relying
        #on dictionary ordering.
        if Arguments.LeaveSomeOut == 1:
            Negative = {
                Split: Negative[CrossValidations - 1 - Split]
                for Split in Splits
            }

        HoldoutMutations = {
            Split: Positive[Split] + Negative[Split] for Split in Splits
        }

    #For ePOSEs it is simpler. Everyone is endophenotype positive. We therefore only have one class.
    elif Arguments.Mode == "ePOSE":
        #A real list is required: shuffle() and slicing do not work on a dict keys view
        Positive = list(Mutations)
        shuffle(
            Positive
        )  #Get rid of bias that MIGHT be inherent in the original order of the mutations
        HoldoutMutations = {
            Split: Positive[Split::CrossValidations] for Split in Splits
        }

    else:
        raise ValueError(
            "Unsupported mode %r: expected 'POSE' or 'ePOSE'" %
            (Arguments.Mode, ))

    cross_validation(HoldoutMutations, Arguments)

    return
Ejemplo n.º 3
0
def leave_some_out(Arguments):
   '''
   Divide up the mutations into hold-out sets before sending off to cross validation.

   Reads Arguments.LeaveSomeOut (used to derive the number of splits) and
   Arguments.Mode ("POSE" or "ePOSE"). Builds a dictionary mapping each split
   index to the list of mutations held out in that split and hands it to
   cross_validation(). Returns None.

   Raises ValueError when Arguments.Mode is neither "POSE" nor "ePOSE"
   (this used to surface as an unbound-variable error further down).
   '''

   Mutations = get_mutations(Arguments)

   CrossValidations = int(ceil(len(Mutations)/float(Arguments.LeaveSomeOut)))
   Splits = range(CrossValidations)

   #For a standard POSE, the trick is keeping an as-balanced-as-possible splitting of the classes for each data split
   if Arguments.Mode == "POSE":
      Positive = [Mutation for Mutation, Phenotype in Mutations.items() if Phenotype]
      Negative = [Mutation for Mutation, Phenotype in Mutations.items() if not Phenotype]

      #Get rid of bias that MIGHT be inherent in the original order of the mutations,
      #then deal each class out as evenly as possible among the cross-validations
      shuffle(Positive)
      Positive = {Split: Positive[Split::CrossValidations] for Split in Splits}

      shuffle(Negative)
      Negative = {Split: Negative[Split::CrossValidations] for Split in Splits}

      #Only for the leave ONE out case do we do cases and controls in series.
      #Both classes fill the splits from index 0 upward, so without this the
      #first splits would hold two mutations and the last ones none. Flipping
      #the split order of one class moves it to the tail end. The earlier code
      #chose the class by comparing the two split dictionaries, which always
      #have the same number of keys, so the negatives were always the ones
      #flipped; that is kept here, with explicit indexing instead of relying
      #on dictionary ordering.
      if Arguments.LeaveSomeOut == 1:
         Negative = {Split: Negative[CrossValidations - 1 - Split] for Split in Splits}

      HoldoutMutations = {Split: Positive[Split] + Negative[Split] for Split in Splits}

   #For ePOSEs it is simpler. Everyone is endophenotype positive. We therefore only have one class.
   elif Arguments.Mode == "ePOSE":
      #A real list is required: shuffle() and slicing do not work on a dict keys view
      Positive = list(Mutations)
      shuffle(Positive)  #Get rid of bias that MIGHT be inherent in the original order of the mutations
      HoldoutMutations = {Split: Positive[Split::CrossValidations] for Split in Splits}

   else:
      raise ValueError("Unsupported mode %r: expected 'POSE' or 'ePOSE'" % (Arguments.Mode,))

   cross_validation(HoldoutMutations, Arguments)

   return
Ejemplo n.º 4
0
def make_pose_input(Arguments):
    '''
    Assemble the full set of inputs used for (e)POSE derivation.

    The result is the tuple
    (mutations, sequences, reference gene, identities, residue burial, annotation).
    The last two entries stay empty dictionaries unless Arguments.Structure
    and Arguments.Annotation, respectively, are truthy.
    '''

    # Required inputs, loaded in a fixed order
    mutations = get_mutations(Arguments)  # amino acid substitutions and binary phenotypes or endophenotypes
    sequences = get_sequences(Arguments)  # the fasta formatted sequence file
    reference_gene = get_reference_gene(Arguments)  # the gene being scored
    identities = get_identities(reference_gene, sequences, Arguments)  # %ID of all sequences relative to ref

    # Optional inputs default to empty and are populated only when requested
    residue_burial = normalized_residue_burial(Arguments) if Arguments.Structure else {}
    annotation = get_annotation(Arguments) if Arguments.Annotation else {}

    collected = (mutations, sequences, reference_gene, identities, residue_burial, annotation)
    return collected