def parser(variant_string, variant):
    """Parse a protein-level HGVS description into the fields of *variant*.

    The first two characters of *variant_string* (the variant type and the
    period that follows it) are taken to have been consumed by the caller, so
    parsing starts at index 2.

    Protein positions pair an amino acid with a numeric index.  The amino
    acid goes into ``position`` / ``range_lower`` / ``range_upper`` and the
    index into the matching ``*_intron`` attribute.  The three-letter amino
    acid code is the preferred HGVS form (e.g. ``Tyr`` for Tyrosine rather
    than ``Y``); when the next three characters are not all alphabetic a
    single character (such as ``*``) is taken instead.

    Attributes are only assigned for the parts that are actually present in
    the string; everything else on *variant* is left untouched.

    Args:
        variant_string: The full variant description, including its
            two-character type prefix.
        variant: Object that receives the parsed parts as attributes.

    Returns:
        The same *variant* object that was passed in.
    """
    length = len(variant_string)
    i = 2

    # Nothing after the type prefix: return instead of raising IndexError.
    if i >= length:
        return variant

    # Skip an opening '[' or '(' in front of the position.
    if variant_string[i] in ('[', '('):
        i += 1
        if i >= length:
            return variant

    # '?' (unknown) or '=' (no change expected) replaces the whole position;
    # whatever follows is kept verbatim as the operator value.
    if variant_string[i] in ('?', '='):
        variant.position = variant_string[i]
        variant.operator_value = variant_string[i + 1:]
        return variant

    # An amino acid must be followed by at least one more character.
    if i + 3 >= length:
        return variant

    # First amino acid: three letters, else one character (covers '*').
    first_amino_acid = variant_string[i:i + 3]
    if not first_amino_acid.isalpha():
        first_amino_acid = variant_string[i]
    i += len(first_amino_acid)

    # Index of that amino acid.  check_numeric_value (defined elsewhere)
    # decides which characters belong to the index.
    start_position = i
    while i < length and check_numeric_value(variant_string[i]):
        i += 1
    position = variant_string[start_position:i]

    # Treat it as a single position until a range separator shows up.
    variant.position = first_amino_acid
    variant.position_intron = position

    if i < length and variant_string[i] == '_':
        i += 1
        if i < length:
            # It is a range: move what was parsed so far into the lower bound.
            variant.position = ''
            variant.range_lower = first_amino_acid
            variant.position_intron = ''
            variant.range_lower_intron = position

            # Upper bound amino acid, parsed exactly like the first one so
            # that '*' and other single characters also advance the cursor.
            range_upper = variant_string[i:i + 3]
            if not range_upper.isalpha():
                range_upper = variant_string[i]
            variant.range_upper = range_upper
            i += len(range_upper)

            # Upper bound index.
            start_position = i
            while i < length and check_numeric_value(variant_string[i]):
                i += 1
            variant.range_upper_intron = variant_string[start_position:i]

    # Whatever is left describes the change (operator and its value).
    remainder = variant_string[i:]
    if not remainder:
        # No change described; leave operator / operator_value untouched.
        return variant

    lowered = remainder.lower()
    operator_value = ''
    if remainder[0] == '*':
        operator = '*'
    elif 'delins' in lowered:
        operator = 'delins'
        # Slice relative to where 'delins' actually occurs rather than
        # assuming it is the first thing in the remainder.
        operator_value = remainder[lowered.index('delins') + 6:]
    elif remainder[0] == '(':
        # Parenthesised remainder is kept whole as the operator.
        operator = remainder
    elif lowered in ('fs*', 'fsx', 'fster'):
        # The whole remainder is a frameshift marker: no operator, the marker
        # itself is the value.
        # NOTE(review): only exact matches reach this branch; a remainder
        # such as 'fs*23' falls through to the generic split below. Confirm
        # whether a prefix match was intended.
        operator = ''
        operator_value = remainder
    elif remainder[:3].isalpha():
        operator = remainder[:3]
        operator_value = remainder[3:]
    else:
        operator = remainder[0]
        operator_value = remainder[1:]

    variant.operator = operator
    variant.operator_value = operator_value
    return variant
def validate(variant):
    """Check the parsed parts of a protein variant for consistency.

    Protein nomenclature differs from the genomic/cDNA structure, so the
    values are stored differently on the variant object: ``position``,
    ``range_lower`` and ``range_upper`` hold amino acids, while the matching
    ``*_intron`` attributes hold the numeric index of that amino acid.

    Args:
        variant: Object exposing the string attributes ``position``,
            ``position_intron``, ``range_lower``, ``range_lower_intron``,
            ``range_upper``, ``range_upper_intron``, ``operator`` and
            ``operator_value``.

    Returns:
        True when every populated part is acceptable, False otherwise.  A
        position of ``'?'`` or ``'='`` is accepted without looking at any
        other attribute.
    """
    # position: amino acid
    position = variant.position
    if position.strip():
        if position in ('?', '='):
            return True
        code = position.lower()
        if (code not in ValidValues.amino_acids
                and code not in ValidValues.amino_acids_single):
            return False

    # position: index of the amino acid (only allowed alongside a position)
    if variant.position_intron.strip():
        if not position.strip():
            return False
        if not check_numeric_value(variant.position_intron):
            return False

    # range_lower: amino acid (mutually exclusive with a single position)
    if variant.range_lower.strip():
        if position.strip():
            return False
        if variant.range_lower.lower() not in ValidValues.amino_acids:
            return False

    # range_lower: index of the amino acid
    if variant.range_lower_intron.strip():
        if not variant.range_lower.strip():
            return False
        if not check_numeric_value(variant.range_lower_intron):
            return False

    # range_upper: amino acid (requires a lower bound)
    if variant.range_upper.strip():
        if not variant.range_lower.strip():
            return False
        if variant.range_upper.lower() not in ValidValues.amino_acids:
            return False

    # range_upper: index of the amino acid
    if variant.range_upper_intron.strip():
        if not variant.range_upper.strip():
            return False
        if not check_numeric_value(variant.range_upper_intron):
            return False

    # Operator: mandatory once the '?' / '=' shortcut has not applied.
    operator = variant.operator
    if not operator.strip():
        return False
    if operator[0] in ('(', '['):
        # Bracketed operator: delegated to get_repeater_value.
        if not get_repeater_value(operator):
            return False
    elif operator.lower() not in ValidValues.protein_operators:
        # Not a known operator keyword, so it has to be an amino acid.
        code = operator.lower()
        if (code not in ValidValues.amino_acids
                and code not in ValidValues.amino_acids_single):
            return False

    # Operator value
    value = variant.operator_value
    if not value.strip():
        return True

    if value[0:2] == 'fs':
        # Frameshift: bare 'fs', or 'fs*' / 'fsX' followed by a count.
        if len(value) > 2:
            if value[2].lower() not in ('*', 'x'):
                return False
            if '];[' in value[3:]:
                # Only the part before the allele separator is checked, and
                # it may be empty.
                count = value[3:value.index('];[')]
                if count != '' and not count.isdigit():
                    return False
            elif not value[3:].isdigit():
                return False
        return True

    # Values containing a closing ']' or ')' are not checked any further.
    if ']' in value or ')' in value:
        return True

    # NOTE(review): the amino acid check below only runs when the operator is
    # NOT one of ValidValues.protein_operators; confirm that this is the
    # intended condition for 'ins' / 'delins' payloads.
    if variant.operator.lower() in ValidValues.protein_operators:
        return True

    # Amino acid codes are three characters long, so the value must be a
    # whole number of triplets.
    if len(value) % 3 != 0:
        return False

    # split() is defined elsewhere; presumably it cuts the string into
    # 3-character pieces - TODO confirm.
    return all(amino_acid in ValidValues.amino_acids
               for amino_acid in split(value.lower(), 3))
def parser(variant_string, variant):
    """Parse a protein-level HGVS description into the fields of *variant*.

    The first two characters of *variant_string* (the variant type and the
    period that follows it) are taken to have been consumed by the caller, so
    parsing starts at index 2.

    Protein positions pair an amino acid with a numeric index.  The amino
    acid goes into ``position`` / ``range_lower`` / ``range_upper`` and the
    index into the matching ``*_intron`` attribute.  The three-letter amino
    acid code is the preferred HGVS form (e.g. ``Tyr`` for Tyrosine rather
    than ``Y``); when the next three characters are not all alphabetic a
    single character (such as ``*``) is taken instead.

    Attributes are only assigned for the parts that are actually present in
    the string; everything else on *variant* is left untouched.

    Args:
        variant_string: The full variant description, including its
            two-character type prefix.
        variant: Object that receives the parsed parts as attributes.

    Returns:
        The same *variant* object that was passed in.
    """
    length = len(variant_string)
    i = 2

    # Nothing after the type prefix: return instead of raising IndexError.
    if i >= length:
        return variant

    # Skip an opening '[' or '(' in front of the position.
    if variant_string[i] in ('[', '('):
        i += 1
        if i >= length:
            return variant

    # '?' (unknown) or '=' (no change expected) replaces the whole position;
    # whatever follows is kept verbatim as the operator value.
    if variant_string[i] in ('?', '='):
        variant.position = variant_string[i]
        variant.operator_value = variant_string[i + 1:]
        return variant

    # An amino acid must be followed by at least one more character.
    if i + 3 >= length:
        return variant

    # First amino acid: three letters, else one character (covers '*').
    first_amino_acid = variant_string[i:i + 3]
    if not first_amino_acid.isalpha():
        first_amino_acid = variant_string[i]
    i += len(first_amino_acid)

    # Index of that amino acid.  check_numeric_value (defined elsewhere)
    # decides which characters belong to the index.
    start_position = i
    while i < length and check_numeric_value(variant_string[i]):
        i += 1
    position = variant_string[start_position:i]

    # Treat it as a single position until a range separator shows up.
    variant.position = first_amino_acid
    variant.position_intron = position

    if i < length and variant_string[i] == '_':
        i += 1
        if i < length:
            # It is a range: move what was parsed so far into the lower bound.
            variant.position = ''
            variant.range_lower = first_amino_acid
            variant.position_intron = ''
            variant.range_lower_intron = position

            # Upper bound amino acid, parsed exactly like the first one so
            # that '*' and other single characters also advance the cursor.
            range_upper = variant_string[i:i + 3]
            if not range_upper.isalpha():
                range_upper = variant_string[i]
            variant.range_upper = range_upper
            i += len(range_upper)

            # Upper bound index.
            start_position = i
            while i < length and check_numeric_value(variant_string[i]):
                i += 1
            variant.range_upper_intron = variant_string[start_position:i]

    # Whatever is left describes the change (operator and its value).
    remainder = variant_string[i:]
    if not remainder:
        # No change described; leave operator / operator_value untouched.
        return variant

    lowered = remainder.lower()
    operator_value = ''
    if remainder[0] == '*':
        operator = '*'
    elif 'delins' in lowered:
        operator = 'delins'
        # Slice relative to where 'delins' actually occurs rather than
        # assuming it is the first thing in the remainder.
        operator_value = remainder[lowered.index('delins') + 6:]
    elif remainder[0] == '(':
        # Parenthesised remainder is kept whole as the operator.
        operator = remainder
    elif lowered in ('fs*', 'fsx', 'fster'):
        # The whole remainder is a frameshift marker: no operator, the marker
        # itself is the value.
        # NOTE(review): only exact matches reach this branch; a remainder
        # such as 'fs*23' falls through to the generic split below. Confirm
        # whether a prefix match was intended.
        operator = ''
        operator_value = remainder
    elif remainder[:3].isalpha():
        operator = remainder[:3]
        operator_value = remainder[3:]
    else:
        operator = remainder[0]
        operator_value = remainder[1:]

    variant.operator = operator
    variant.operator_value = operator_value
    return variant
def validate(variant):
    """Check the parsed parts of a protein variant for consistency.

    Protein nomenclature differs from the genomic/cDNA structure, so the
    values are stored differently on the variant object: ``position``,
    ``range_lower`` and ``range_upper`` hold amino acids, while the matching
    ``*_intron`` attributes hold the numeric index of that amino acid.

    Args:
        variant: Object exposing the string attributes ``position``,
            ``position_intron``, ``range_lower``, ``range_lower_intron``,
            ``range_upper``, ``range_upper_intron``, ``operator`` and
            ``operator_value``.

    Returns:
        True when every populated part is acceptable, False otherwise.  A
        position of ``'?'`` or ``'='`` is accepted without looking at any
        other attribute.
    """
    # position: amino acid
    position = variant.position
    if position.strip():
        if position in ('?', '='):
            return True
        code = position.lower()
        if (code not in ValidValues.amino_acids
                and code not in ValidValues.amino_acids_single):
            return False

    # position: index of the amino acid (only allowed alongside a position)
    if variant.position_intron.strip():
        if not position.strip():
            return False
        if not check_numeric_value(variant.position_intron):
            return False

    # range_lower: amino acid (mutually exclusive with a single position)
    if variant.range_lower.strip():
        if position.strip():
            return False
        if variant.range_lower.lower() not in ValidValues.amino_acids:
            return False

    # range_lower: index of the amino acid
    if variant.range_lower_intron.strip():
        if not variant.range_lower.strip():
            return False
        if not check_numeric_value(variant.range_lower_intron):
            return False

    # range_upper: amino acid (requires a lower bound)
    if variant.range_upper.strip():
        if not variant.range_lower.strip():
            return False
        if variant.range_upper.lower() not in ValidValues.amino_acids:
            return False

    # range_upper: index of the amino acid
    if variant.range_upper_intron.strip():
        if not variant.range_upper.strip():
            return False
        if not check_numeric_value(variant.range_upper_intron):
            return False

    # Operator: mandatory once the '?' / '=' shortcut has not applied.
    operator = variant.operator
    if not operator.strip():
        return False
    if operator[0] in ('(', '['):
        # Bracketed operator: delegated to get_repeater_value.
        if not get_repeater_value(operator):
            return False
    elif operator.lower() not in ValidValues.protein_operators:
        # Not a known operator keyword, so it has to be an amino acid.
        code = operator.lower()
        if (code not in ValidValues.amino_acids
                and code not in ValidValues.amino_acids_single):
            return False

    # Operator value
    value = variant.operator_value
    if not value.strip():
        return True

    if value[0:2] == 'fs':
        # Frameshift: bare 'fs', or 'fs*' / 'fsX' followed by a count.
        if len(value) > 2:
            if value[2].lower() not in ('*', 'x'):
                return False
            if '];[' in value[3:]:
                # Only the part before the allele separator is checked, and
                # it may be empty.
                count = value[3:value.index('];[')]
                if count != '' and not count.isdigit():
                    return False
            elif not value[3:].isdigit():
                return False
        return True

    # Values containing a closing ']' or ')' are not checked any further.
    if ']' in value or ')' in value:
        return True

    # NOTE(review): the amino acid check below only runs when the operator is
    # NOT one of ValidValues.protein_operators; confirm that this is the
    # intended condition for 'ins' / 'delins' payloads.
    if variant.operator.lower() in ValidValues.protein_operators:
        return True

    # Amino acid codes are three characters long, so the value must be a
    # whole number of triplets.
    if len(value) % 3 != 0:
        return False

    # split() is defined elsewhere; presumably it cuts the string into
    # 3-character pieces - TODO confirm.
    return all(amino_acid in ValidValues.amino_acids
               for amino_acid in split(value.lower(), 3))