Ejemplo n.º 1
0
def parser(variant_string, variant):
    """Parse the protein part of a variant string into ``variant``.

    Parsing starts at index 2 of ``variant_string``; the caller is expected
    to have already handled the two-character type prefix (the characters
    before and including the period).

    Residues are read as three alphabetic characters when possible, otherwise
    as a single character; ``*`` is always read as a one-character residue.
    Because protein nomenclature pairs a residue with a numeric index, the
    residue is stored in ``position`` / ``range_lower`` / ``range_upper`` and
    the index in the matching ``*_intron`` attribute.

    Attributes that may be written on ``variant`` (only those reached by the
    input are touched): ``position``, ``position_intron``, ``range_lower``,
    ``range_lower_intron``, ``range_upper``, ``range_upper_intron``,
    ``operator`` and ``operator_value``.

    Args:
        variant_string: the full variant string, including the type prefix.
        variant: object whose attributes receive the parsed pieces.

    Returns:
        The same ``variant`` object. Inputs too short to contain a residue
        and an index are returned without any attribute being set.
    """
    # The type prefix has already been consumed; start just after the period.
    i = 2
    length = len(variant_string)

    def read_residue(pos):
        # Return (residue, next_index) for the residue starting at ``pos``.
        if variant_string[pos] == '*':
            # Consume the '*' so the index digits that follow are read as
            # the index instead of being mistaken for the operator.
            return '*', pos + 1
        code = variant_string[pos:pos + 3]
        if code.isalpha():
            return code, pos + 3
        return variant_string[pos], pos + 1

    def read_index(pos):
        # Return (index_text, next_index); characters are accepted for as
        # long as check_numeric_value (defined outside this block) says so.
        end = pos
        while end < length and check_numeric_value(variant_string[end]):
            end += 1
        return variant_string[pos:end], end

    # Nothing after the prefix: leave the variant untouched, exactly like
    # the other "too short" inputs handled further down.
    if i >= length:
        return variant

    # --- Position ---
    # Skip an opening '[' or '('.
    if variant_string[i] in ('[', '('):
        i += 1
        if i >= length:
            return variant

    # '?' means unknown, '=' means no change expected.
    if variant_string[i] in ('?', '='):
        variant.position = variant_string[i]
        variant.operator_value = variant_string[i + 1:]
        return variant

    # More than three characters must remain for a residue plus an index.
    if i + 3 >= length:
        return variant

    first_amino_acid, i = read_residue(i)
    position, i = read_index(i)

    # Assume a single position for now; rewritten below if a range follows.
    variant.position = first_amino_acid
    variant.position_intron = position

    if i >= length:
        return variant

    # --- Range ---
    if variant_string[i] == '_':
        i += 1
        if i < length:
            # What was read so far is the lower bound of a range.
            variant.position = ''
            variant.range_lower = first_amino_acid
            variant.position_intron = ''
            variant.range_lower_intron = position

            variant.range_upper, i = read_residue(i)
            variant.range_upper_intron, i = read_index(i)

    # --- Operator: everything that is left ---
    operator = variant_string[i:]
    operator_value = ''
    lowered = operator.lower()

    if not operator:
        # Nothing follows the position/range; store empty values instead of
        # failing on operator[0].
        pass
    elif operator[0] == '*':
        operator = '*'
    elif 'delins' in lowered:
        operator = 'delins'
        operator_value = variant_string[i + 6:]
    elif operator[0] == '(':
        # Kept whole in the operator with an empty value.
        pass
    elif lowered in ('fs*', 'fsx', 'fster'):
        # A bare frameshift marker is stored entirely in the value. The
        # match is case-insensitive, so no literal 'fs*' lookup is done.
        operator_value = operator
        operator = ''
    elif operator[:3].isalpha():
        operator, operator_value = operator[:3], operator[3:]
    else:
        operator, operator_value = operator[0], operator[1:]

    variant.operator = operator
    variant.operator_value = operator_value

    return variant
Ejemplo n.º 2
0
def validate(variant):
    """Return True when the parsed protein variant is internally consistent.

    Protein nomenclature differs from genomic/cDNA nomenclature, so the
    fields are used differently here: ``position``, ``range_lower`` and
    ``range_upper`` hold amino acid codes, while the matching ``*_intron``
    fields hold the numeric index of that amino acid.

    Relies on ``ValidValues``, ``check_numeric_value``,
    ``get_repeater_value`` and ``split``, all defined outside this block.

    Args:
        variant: object exposing the string attributes ``position``,
            ``position_intron``, ``range_lower``, ``range_lower_intron``,
            ``range_upper``, ``range_upper_intron``, ``operator`` and
            ``operator_value``.

    Returns:
        True if every populated field passes its check, False otherwise.
        A position of '?' or '=' is accepted immediately.
    """

    def is_amino_acid(code):
        # Accept either the long or the single-letter code table.
        code = code.lower()
        return (code in ValidValues.amino_acids
                or code in ValidValues.amino_acids_single)

    # position: amino acid
    if variant.position.strip():
        if variant.position in ('?', '='):
            return True
        if not is_amino_acid(variant.position):
            return False

    # position: index of the amino acid (requires a position residue)
    if variant.position_intron.strip():
        if not variant.position.strip():
            return False
        if not check_numeric_value(variant.position_intron):
            return False

    # range_lower: amino acid (mutually exclusive with a single position)
    if variant.range_lower.strip():
        if variant.position.strip():
            return False
        if variant.range_lower.lower() not in ValidValues.amino_acids:
            return False

    # range_lower: index of the amino acid
    if variant.range_lower_intron.strip():
        if not variant.range_lower.strip():
            return False
        if not check_numeric_value(variant.range_lower_intron):
            return False

    # range_upper: amino acid (requires a lower bound)
    if variant.range_upper.strip():
        if not variant.range_lower.strip():
            return False
        if variant.range_upper.lower() not in ValidValues.amino_acids:
            return False

    # range_upper: index of the amino acid
    if variant.range_upper_intron.strip():
        if not variant.range_upper.strip():
            return False
        if not check_numeric_value(variant.range_upper_intron):
            return False

    # Operator: mandatory once this point is reached.
    if not variant.operator.strip():
        return False
    if variant.operator[0] in ('(', '['):
        # Bracketed operators are checked by the repeat helper.
        if not get_repeater_value(variant.operator):
            return False
    elif variant.operator.lower() not in ValidValues.protein_operators:
        # Anything that is not a known operator must be an amino acid.
        if not is_amino_acid(variant.operator):
            return False

    # Operator value
    value = variant.operator_value
    if value.strip():
        if value[0:2] == 'fs':
            # Frameshift: 'fs' alone, or 'fs*'/'fsX' followed by digits,
            # optionally terminated by '];['.
            if len(value) > 2:
                if value[2].lower() not in ('*', 'x'):
                    return False
                if '];[' in value[3:]:
                    count = value[3:value.index('];[')]
                    if count != '' and not count.isdigit():
                        return False
                elif not value[3:].isdigit():
                    return False
        elif ']' not in value and ')' not in value:
            # NOTE(review): the surrounding comments in the original say this
            # check is meant for indel/insertion operators, yet the guard
            # selects operators that are NOT in protein_operators. Guard left
            # as is; confirm against ValidValues.protein_operators.
            if variant.operator.lower() not in ValidValues.protein_operators:
                # Amino acid codes are 3 characters long, so the value must
                # be a whole number of codes. (The original test was
                # inverted and rejected exactly the well-formed lengths.)
                if len(value) % 3 != 0:
                    return False
                # presumably split() yields consecutive 3-character chunks
                # -- confirm against its definition
                for amino_acid in split(value.lower(), 3):
                    if amino_acid not in ValidValues.amino_acids:
                        return False

    return True
Ejemplo n.º 3
0
def parser(variant_string, variant):
    """Parse the protein part of a variant string into ``variant``.

    Parsing starts at index 2 of ``variant_string``; the caller is expected
    to have already handled the two-character type prefix (the characters
    before and including the period).

    Residues are read as three alphabetic characters when possible, otherwise
    as a single character; ``*`` is always read as a one-character residue.
    Because protein nomenclature pairs a residue with a numeric index, the
    residue is stored in ``position`` / ``range_lower`` / ``range_upper`` and
    the index in the matching ``*_intron`` attribute.

    Attributes that may be written on ``variant`` (only those reached by the
    input are touched): ``position``, ``position_intron``, ``range_lower``,
    ``range_lower_intron``, ``range_upper``, ``range_upper_intron``,
    ``operator`` and ``operator_value``.

    Args:
        variant_string: the full variant string, including the type prefix.
        variant: object whose attributes receive the parsed pieces.

    Returns:
        The same ``variant`` object. Inputs too short to contain a residue
        and an index are returned without any attribute being set.
    """
    # The type prefix has already been consumed; start just after the period.
    i = 2
    length = len(variant_string)

    def read_residue(pos):
        # Return (residue, next_index) for the residue starting at ``pos``.
        if variant_string[pos] == '*':
            # Consume the '*' so the index digits that follow are read as
            # the index instead of being mistaken for the operator.
            return '*', pos + 1
        code = variant_string[pos:pos + 3]
        if code.isalpha():
            return code, pos + 3
        return variant_string[pos], pos + 1

    def read_index(pos):
        # Return (index_text, next_index); characters are accepted for as
        # long as check_numeric_value (defined outside this block) says so.
        end = pos
        while end < length and check_numeric_value(variant_string[end]):
            end += 1
        return variant_string[pos:end], end

    # Nothing after the prefix: leave the variant untouched, exactly like
    # the other "too short" inputs handled further down.
    if i >= length:
        return variant

    # --- Position ---
    # Skip an opening '[' or '('.
    if variant_string[i] in ('[', '('):
        i += 1
        if i >= length:
            return variant

    # '?' means unknown, '=' means no change expected.
    if variant_string[i] in ('?', '='):
        variant.position = variant_string[i]
        variant.operator_value = variant_string[i + 1:]
        return variant

    # More than three characters must remain for a residue plus an index.
    if i + 3 >= length:
        return variant

    first_amino_acid, i = read_residue(i)
    position, i = read_index(i)

    # Assume a single position for now; rewritten below if a range follows.
    variant.position = first_amino_acid
    variant.position_intron = position

    if i >= length:
        return variant

    # --- Range ---
    if variant_string[i] == '_':
        i += 1
        if i < length:
            # What was read so far is the lower bound of a range.
            variant.position = ''
            variant.range_lower = first_amino_acid
            variant.position_intron = ''
            variant.range_lower_intron = position

            variant.range_upper, i = read_residue(i)
            variant.range_upper_intron, i = read_index(i)

    # --- Operator: everything that is left ---
    operator = variant_string[i:]
    operator_value = ''
    lowered = operator.lower()

    if not operator:
        # Nothing follows the position/range; store empty values instead of
        # failing on operator[0].
        pass
    elif operator[0] == '*':
        operator = '*'
    elif 'delins' in lowered:
        operator = 'delins'
        operator_value = variant_string[i + 6:]
    elif operator[0] == '(':
        # Kept whole in the operator with an empty value.
        pass
    elif lowered in ('fs*', 'fsx', 'fster'):
        # A bare frameshift marker is stored entirely in the value. The
        # match is case-insensitive, so no literal 'fs*' lookup is done.
        operator_value = operator
        operator = ''
    elif operator[:3].isalpha():
        operator, operator_value = operator[:3], operator[3:]
    else:
        operator, operator_value = operator[0], operator[1:]

    variant.operator = operator
    variant.operator_value = operator_value

    return variant
def validate(variant):
    """Return True when the parsed protein variant is internally consistent.

    Protein nomenclature differs from genomic/cDNA nomenclature, so the
    fields are used differently here: ``position``, ``range_lower`` and
    ``range_upper`` hold amino acid codes, while the matching ``*_intron``
    fields hold the numeric index of that amino acid.

    Relies on ``ValidValues``, ``check_numeric_value``,
    ``get_repeater_value`` and ``split``, all defined outside this block.

    Args:
        variant: object exposing the string attributes ``position``,
            ``position_intron``, ``range_lower``, ``range_lower_intron``,
            ``range_upper``, ``range_upper_intron``, ``operator`` and
            ``operator_value``.

    Returns:
        True if every populated field passes its check, False otherwise.
        A position of '?' or '=' is accepted immediately.
    """

    def is_amino_acid(code):
        # Accept either the long or the single-letter code table.
        code = code.lower()
        return (code in ValidValues.amino_acids
                or code in ValidValues.amino_acids_single)

    # position: amino acid
    if variant.position.strip():
        if variant.position in ('?', '='):
            return True
        if not is_amino_acid(variant.position):
            return False

    # position: index of the amino acid (requires a position residue)
    if variant.position_intron.strip():
        if not variant.position.strip():
            return False
        if not check_numeric_value(variant.position_intron):
            return False

    # range_lower: amino acid (mutually exclusive with a single position)
    if variant.range_lower.strip():
        if variant.position.strip():
            return False
        if variant.range_lower.lower() not in ValidValues.amino_acids:
            return False

    # range_lower: index of the amino acid
    if variant.range_lower_intron.strip():
        if not variant.range_lower.strip():
            return False
        if not check_numeric_value(variant.range_lower_intron):
            return False

    # range_upper: amino acid (requires a lower bound)
    if variant.range_upper.strip():
        if not variant.range_lower.strip():
            return False
        if variant.range_upper.lower() not in ValidValues.amino_acids:
            return False

    # range_upper: index of the amino acid
    if variant.range_upper_intron.strip():
        if not variant.range_upper.strip():
            return False
        if not check_numeric_value(variant.range_upper_intron):
            return False

    # Operator: mandatory once this point is reached.
    if not variant.operator.strip():
        return False
    if variant.operator[0] in ('(', '['):
        # Bracketed operators are checked by the repeat helper.
        if not get_repeater_value(variant.operator):
            return False
    elif variant.operator.lower() not in ValidValues.protein_operators:
        # Anything that is not a known operator must be an amino acid.
        if not is_amino_acid(variant.operator):
            return False

    # Operator value
    value = variant.operator_value
    if value.strip():
        if value[0:2] == 'fs':
            # Frameshift: 'fs' alone, or 'fs*'/'fsX' followed by digits,
            # optionally terminated by '];['.
            if len(value) > 2:
                if value[2].lower() not in ('*', 'x'):
                    return False
                if '];[' in value[3:]:
                    count = value[3:value.index('];[')]
                    if count != '' and not count.isdigit():
                        return False
                elif not value[3:].isdigit():
                    return False
        elif ']' not in value and ')' not in value:
            # NOTE(review): the surrounding comments in the original say this
            # check is meant for indel/insertion operators, yet the guard
            # selects operators that are NOT in protein_operators. Guard left
            # as is; confirm against ValidValues.protein_operators.
            if variant.operator.lower() not in ValidValues.protein_operators:
                # Amino acid codes are 3 characters long, so the value must
                # be a whole number of codes. (The original test was
                # inverted and rejected exactly the well-formed lengths.)
                if len(value) % 3 != 0:
                    return False
                # presumably split() yields consecutive 3-character chunks
                # -- confirm against its definition
                for amino_acid in split(value.lower(), 3):
                    if amino_acid not in ValidValues.amino_acids:
                        return False

    return True