def EGAAC(fastas, window=5, **kw):
    """Encode each sequence as per-window residue-group frequencies.

    A window of ``window`` symbols is slid along every sequence one position
    at a time.  For each window position, the fraction of symbols belonging
    to each of the five residue groups defined below is appended to the row.

    Args:
        fastas: Records indexable as ``[name, sequence, label]``.
        window: Sliding-window width; must be at least 1 and no longer than
            the shortest sequence.
        **kw: Accepted for signature compatibility; not read here.

    Returns:
        A list of rows.  The first row is the header
        (``'#'``, ``'label'``, then ``'SW.<position>.<group>'`` columns sized
        from the first record's sequence); each following row is
        ``[name, label, freq, ...]``.  Returns ``0`` after printing an error
        message when validation fails.
    """
    # The helper has to be *called*: comparing the function object itself to
    # False is never true, so the equal-length guard previously never fired
    # and unequal-length input produced rows that did not match the header.
    # NOTE(review): assumes the helper takes the record list and returns a
    # truthy value when all sequences share one length - confirm against
    # check_sequences.
    if not check_sequences.check_fasta_with_equal_length(fastas):
        print(
            'Error: for "EGAAC" encoding, the input fasta sequences should be with equal length. \n\n'
        )
        return 0

    if window < 1:
        print('Error: the sliding window should be greater than zero' + '\n\n')
        return 0

    if check_sequences.get_min_sequence_length(fastas) < window:
        print(
            'Error: all the sequence length should be larger than the sliding window :'
            + str(window) + '\n\n')
        return 0

    # Group names are emitted in the header, so their spelling is kept as-is.
    group = {
        'alphaticr': 'GAVLMI',
        'aromatic': 'FYW',
        'postivecharger': 'KRH',
        'negativecharger': 'DE',
        'uncharger': 'STCPNQ'
    }

    # One header column per (window position, group); positions are 1-based.
    header = ['#', 'label']
    for position in range(1, len(fastas[0][1]) - window + 2):
        for group_name in group:
            header.append('SW.' + str(position) + '.' + group_name)
    encodings = [header]

    for record in fastas:
        name, sequence, label = record[0], record[1], record[2]
        code = [name, label]
        # Only start offsets where a full-width window still fits.
        for start in range(len(sequence) - window + 1):
            count = Counter(sequence[start:start + window])
            for members in group.values():
                # Counter yields 0 for symbols absent from the window.
                code.append(sum(count[aa] for aa in members) / window)
        encodings.append(code)
    return encodings
def CKSNAP(fastas, gap=2, **kw):
    """Encode each sequence as k-spaced symbol-pair frequencies.

    For every gap ``g`` in ``0..gap`` the function counts ordered pairs of
    symbols located ``g + 1`` positions apart, considering only pairs where
    both symbols belong to the alphabet, and appends each pair's share of the
    counted pairs to the row.

    Args:
        fastas: Records indexable as ``[name, sequence, label]``.
        gap: Largest gap to evaluate; must be zero or greater.
        **kw: Optional ``order`` giving the alphabet to use.  When ``order``
            is missing or ``None`` the alphabet defaults to ``'ACGT'``.

    Returns:
        A list of rows.  The first row is the header
        (``'#'``, ``'label'``, then ``'<pair>.gap<g>'`` columns); each
        following row is ``[name, label, freq, ...]``.  Returns ``0`` after
        printing an error message when validation fails.
    """
    if gap < 0:
        print('Error: the gap should be equal or greater than zero' + '\n\n')
        return 0

    if check_sequences.get_min_sequence_length(fastas) < gap + 2:
        print(
            'Error: all the sequence length should be larger than the (gap value) + 2 = '
            + str(gap + 2) + '\n\n')
        return 0

    # .get() so that callers who omit ``order`` fall back to the default
    # alphabet instead of hitting a KeyError.
    # NOTE(review): ENAC in this file defaults to 'ACGU' while this function
    # defaults to 'ACGT' - confirm the difference is intended.
    order = kw.get('order')
    AA = order if order is not None else 'ACGT'

    aaPairs = [aa1 + aa2 for aa1 in AA for aa2 in AA]

    header = ['#', 'label']
    for g in range(gap + 1):
        for pair in aaPairs:
            header.append(pair + '.gap' + str(g))
    encodings = [header]

    for record in fastas:
        name, sequence, label = record[0], record[1], record[2]
        code = [name, label]
        for g in range(gap + 1):
            pair_counts = dict.fromkeys(aaPairs, 0)
            total = 0
            # Pair every symbol with the one g + 1 positions to its right.
            for first, second in zip(sequence, sequence[g + 1:]):
                if first in AA and second in AA:
                    pair_counts[first + second] += 1
                    total += 1
            if total == 0:
                # No valid pair at this gap (e.g. the sequence holds no
                # alphabet symbols): emit zeros rather than divide by zero.
                code.extend(0 for _ in aaPairs)
            else:
                code.extend(pair_counts[pair] / total for pair in aaPairs)
        encodings.append(code)
    return encodings
def ENAC(fastas, window=5, **kw):
    """Encode each sequence as per-window symbol frequencies.

    A window of ``window`` symbols is slid along every sequence one position
    at a time.  For each window position, the frequency of every alphabet
    symbol inside the window is appended to the row.

    Args:
        fastas: Records indexable as ``[name, sequence, label]``.
        window: Sliding-window width; must be at least 1 and no longer than
            the shortest sequence.
        **kw: Optional ``order`` giving the alphabet to use.  When ``order``
            is missing or ``None`` the alphabet defaults to ``'ACGU'``.

    Returns:
        A list of rows.  The first row is the header
        (``'#'``, ``'label'``, then ``'SW.<position>.<symbol>'`` columns sized
        from the first record's sequence); each following row is
        ``[name, label, freq, ...]``.  Returns ``0`` after printing an error
        message when validation fails.
    """
    # The helper has to be *called*: comparing the function object itself to
    # False is never true, so the equal-length guard previously never fired
    # and unequal-length input produced rows that did not match the header.
    # NOTE(review): assumes the helper takes the record list and returns a
    # truthy value when all sequences share one length - confirm against
    # check_sequences.
    if not check_sequences.check_fasta_with_equal_length(fastas):
        print(
            'Error: for "ENAC" encoding, the input fasta sequences should be with equal length. \n\n'
        )
        return 0

    if window < 1:
        print('Error: the sliding window should be greater than zero' + '\n\n')
        return 0

    if check_sequences.get_min_sequence_length(fastas) < window:
        print(
            'Error: all the sequence length should be larger than the sliding window :'
            + str(window) + '\n\n')
        return 0

    # .get() so that callers who omit ``order`` fall back to the default
    # alphabet instead of hitting a KeyError.
    order = kw.get('order')
    AA = order if order is not None else 'ACGU'

    # One header column per (window position, symbol); positions are 1-based.
    header = ['#', 'label']
    for position in range(1, len(fastas[0][1]) - window + 2):
        for aa in AA:
            header.append('SW.' + str(position) + '.' + aa)
    encodings = [header]

    for record in fastas:
        name, sequence, label = record[0], record[1], record[2]
        code = [name, label]
        # Only start offsets where a full-width window still fits, so every
        # slice below is exactly ``window`` symbols long.
        for start in range(len(sequence) - window + 1):
            segment = sequence[start:start + window]
            frequency = {
                symbol: hits / window
                for symbol, hits in Counter(segment).items()
            }
            # Symbols absent from the window contribute an integer 0.
            code.extend(frequency.get(aa, 0) for aa in AA)
        encodings.append(code)
    return encodings
def CKSAAGP(fastas, gap=5, **kw):
    """Encode each sequence as k-spaced residue-group-pair frequencies.

    Every residue in the 20-letter alphabet below is mapped to one of five
    groups.  For each gap ``g`` in ``0..gap`` the function counts ordered
    group pairs formed by residues ``g + 1`` positions apart (after removing
    ``'-'`` characters from the sequence) and appends each pair's share of
    the counted pairs to the row, or zeros when nothing was counted.

    Args:
        fastas: Records indexable as ``[name, sequence, label]``.
        gap: Largest gap to evaluate; must be zero or greater.
        **kw: Accepted for signature compatibility; not read here.

    Returns:
        A list of rows.  The first row is the header
        (``'#'``, ``'label'``, then ``'<group1>.<group2>.gap<g>'`` columns);
        each following row is ``[name, label, freq, ...]``.  Returns ``0``
        after printing an error message when validation fails.
    """
    if gap < 0:
        print('Error: the gap should be equal or greater than zero' + '\n\n')
        return 0

    shortest = check_sequences.get_min_sequence_length(fastas)
    if shortest < gap + 2:
        print(
            'Error: all the sequence length should be larger than the (gap value) + 2 = '
            + str(gap + 2) + '\n\n')
        return 0

    # Group names are emitted in the header, so their spelling is kept as-is.
    group = {
        'alphaticr': 'GAVLMI',
        'aromatic': 'FYW',
        'postivecharger': 'KRH',
        'negativecharger': 'DE',
        'uncharger': 'STCPNQ'
    }
    AA = 'ARNDCQEGHILKMFPSTWYV'
    groupKey = group.keys()

    # Reverse lookup: residue letter -> name of the group containing it.
    group_of = {aa: key for key in groupKey for aa in group[key]}

    # Ordered group-pair labels; this order fixes the column order per gap.
    gPairIndex = [key1 + '.' + key2 for key1 in groupKey for key2 in groupKey]

    header = ['#', 'label']
    header.extend(
        label_ + '.gap' + str(g) for g in range(gap + 1) for label_ in gPairIndex
    )
    encodings = [header]

    for record in fastas:
        name, sequence, label = record[0], re.sub('-', '', record[1]), record[2]
        code = [name, label]
        for g in range(gap + 1):
            # Fresh counter table for this gap, built by the helper defined
            # elsewhere in the file.
            pair_counts = generateGroupPairs(groupKey)
            total = 0
            # Pair every residue with the one g + 1 positions to its right.
            for left, right in zip(sequence, sequence[g + 1:]):
                if left in AA and right in AA:
                    pair_key = group_of[left] + '.' + group_of[right]
                    pair_counts[pair_key] = pair_counts[pair_key] + 1
                    total = total + 1
            if total == 0:
                code.extend(0 for _ in gPairIndex)
            else:
                code.extend(pair_counts[gp] / total for gp in gPairIndex)
        encodings.append(code)
    return encodings