def ConsurfParser(input, input_format=None):
    """Extract one integer score per data line from a ConSurf text file.

    A line is used when it has at least four whitespace-separated fields, its
    first field is numeric and its fourth field starts with a decimal digit.
    Only that leading digit is read, so every score is in the range 0-9. All
    other lines are ignored.

    Args:
        input: Full text of the file.
        input_format: Not used by this parser.

    Returns:
        List of int scores in file order.

    Raises:
        InvalidFormat: If no line yields a score.
    """
    output = []
    for raw_line in input.split('\n'):
        fields = raw_line.split()
        if len(fields) < 4 or not fields[0].isnumeric():
            continue
        grade = fields[3][0]
        # isdecimal(), unlike isnumeric(), only accepts characters int() can
        # convert; characters such as superscripts or fractions are skipped
        # instead of crashing with ValueError.
        if not grade.isdecimal():
            continue
        # A single decimal character is always 0-9, so no upper-bound check
        # is needed here.
        output.append(int(grade))
    if not output:
        raise InvalidFormat('Unable to parse prediction on consurf file')
    return output
def CCMpredParser(input, input_format=None):
    """Parse a whitespace-separated score matrix into residue contacts.

    Each data line is one matrix row and each field in it one column. Blank
    lines, lines with a single field, and lines whose first field is
    alphabetic or contains '#' are skipped. Scores below 0.1 and pairs closer
    than five positions apart are dropped.

    Args:
        input: Full text of the file.
        input_format: Not used by this parser.

    Returns:
        Whatever get_unique_contacts (defined elsewhere) returns for the list
        of ((higher_residue, lower_residue), score) entries built here.

    Raises:
        InvalidFormat: If a field in a data line is not a float, or if no
            contact is found.
    """
    output = []
    # Count data rows only: skipped header/comment/blank lines must not
    # advance the row number, otherwise every later residue index is shifted.
    res_1 = 0
    for raw_line in input.split('\n'):
        fields = raw_line.split()
        if not fields or len(fields) == 1:
            continue
        if fields[0].isalpha() or '#' in fields[0]:
            continue
        res_1 += 1
        for res_2, field in enumerate(fields, 1):
            try:
                raw_score = float(field)
            except ValueError as error:
                raise InvalidFormat('Unable to parse contacts') from error
            if raw_score < 0.1 or abs(res_1 - res_2) < 5:
                continue
            # Store the pair with the higher residue number first.
            pair = (max(res_1, res_2), min(res_1, res_2))
            output.append((pair, raw_score))
    if not output:
        raise InvalidFormat('Unable to parse contacts')
    return get_unique_contacts(output)
def CASPRR2Parser(input, input_format=None):
    """Parse contact lines made of exactly 13 whitespace-separated fields.

    Fields 0 and 1 are the residue numbers, field 2 the raw score and the
    remaining ten fields a list of probabilities. For each pair at least five
    positions apart the highest probability and its position in that list are
    recorded alongside the raw score.

    Args:
        input: Full text of the file.
        input_format: Not used by this parser.

    Returns:
        The result of get_unique_distances (defined elsewhere) applied to the
        entries ((higher_residue, lower_residue), raw_score, best_bin,
        best_probability).

    Raises:
        InvalidFormat: If no contact is found, or if any entry after the
            first in the get_unique_distances result has index 3 above 9 or
            index 4 above 1.
    """
    expected_fields = 13
    parsed = []
    for raw_line in input.split('\n'):
        fields = raw_line.strip().split()
        if len(fields) != expected_fields:
            continue
        if not (fields[0].isdigit() and fields[1].isdigit()):
            continue
        first, second = int(fields[0]), int(fields[1])
        if abs(first - second) < 5:
            continue
        raw_score = float(fields[2])
        probabilities = [float(value) for value in fields[3:]]
        best_probability = max(probabilities)
        best_bin = probabilities.index(best_probability)
        pair = (max(first, second), min(first, second))
        parsed.append((pair, raw_score, best_bin, best_probability))

    if not parsed:
        raise InvalidFormat('Unable to parse CASPRR_MODE_2 file')

    unique_contacts = get_unique_distances(parsed)
    # The first element of the result is excluded from validation.
    out_of_range = [
        entry for entry in unique_contacts[1:]
        if entry[3] > 9 or entry[4] > 1
    ]
    if any(out_of_range):
        raise InvalidFormat('Unable to parse CASPRR_MODE_2 file')
    return unique_contacts
def PDBParser(input, input_format=None):
    """Extract contacts from the first chain of a PDB-format text.

    The text is loaded with BioPDBParser, the first chain is passed through
    remove_atoms and then get_chain_contacts (both defined elsewhere).

    Args:
        input: Full text of the PDB file.
        input_format: Not used by this parser.

    Returns:
        A list whose first element is the string "PDB", followed by the
        contacts sorted by their third item in descending order.

    Raises:
        InvalidFormat: If loading or processing the structure fails, or if no
            contacts are found.
    """
    try:
        structure = BioPDBParser().get_structure('pdb', io.StringIO(input))
        chain = list(structure.get_chains())[0]
        remove_atoms(chain)
        contacts = get_chain_contacts(chain)
    except Exception as error:
        # Any parsing failure means the upload is unusable. Catch Exception
        # rather than everything so KeyboardInterrupt/SystemExit still
        # propagate, and keep the original error as the cause.
        raise InvalidFormat('Unable to parse contacts') from error
    if not contacts:
        raise InvalidFormat('Unable to parse contacts')
    return ["PDB"] + sorted(contacts, key=itemgetter(2), reverse=True)
def ContactParser(input, input_format):
    """Parse a contact file whose layout is selected by ``input_format``.

    The column positions, minimum number of fields and field-separator regex
    are read from the ``.value`` of the member named ``input_format`` in five
    lookup classes defined elsewhere (presumably Enums). Lines with too few
    fields or non-digit residue fields are skipped, as are pairs closer than
    five positions apart.

    Args:
        input: Full text of the file.
        input_format: Name of the member to look up in each lookup class.

    Returns:
        Whatever get_unique_contacts (defined elsewhere) returns for the list
        of ((higher_residue, lower_residue), score) entries built here. The
        score is 0 when the format defines no raw-score column.

    Raises:
        InvalidFormat: If no contact is found.
    """
    # Use the builtin getattr() instead of calling Class.__getattr__ directly:
    # EnumMeta.__getattr__ was removed in Python 3.12, so the direct call
    # fails there, while getattr() works on every version.
    res_1_idx = getattr(FieldResidueOneContactFormats, input_format).value
    res_2_idx = getattr(FieldResidueTwoContactFormats, input_format).value
    raw_score_idx = getattr(FieldRawScoreContactFormats, input_format).value
    line_size = getattr(LineSizeContactFormats, input_format).value
    regex = getattr(FieldSeparatorContactFormats, input_format).value

    output = []
    for raw_line in input.split('\n'):
        fields = re.split(regex, raw_line.strip())
        if len(fields) < line_size:
            continue
        if not fields[res_1_idx].isdigit() or not fields[res_2_idx].isdigit():
            continue
        res_1 = int(fields[res_1_idx])
        res_2 = int(fields[res_2_idx])
        if abs(res_1 - res_2) < 5:
            continue
        raw_score = 0 if raw_score_idx is None else float(fields[raw_score_idx])
        # Store the pair with the higher residue number first.
        pair = (max(res_1, res_2), min(res_1, res_2))
        output.append((pair, raw_score))

    if not output:
        raise InvalidFormat('Unable to parse contacts')
    return get_unique_contacts(output)
def guess_psipred_format(contents):
    """Pick the PSIPRED parser matching the first format marker found.

    Args:
        contents: Iterable of lines.

    Returns:
        Ss2Parser if the first line containing a marker holds
        '# PSIPRED VFORMAT', otherwise HorizParser (marker
        '# PSIPRED HFORMAT').

    Raises:
        InvalidFormat: If no line contains either marker.
    """
    vertical_marker = '# PSIPRED VFORMAT'
    horizontal_marker = '# PSIPRED HFORMAT'
    marker_line = next(
        (line for line in contents
         if vertical_marker in line or horizontal_marker in line),
        None,
    )
    if marker_line is None:
        raise InvalidFormat('Unable to guess psipred file format')
    # The vertical marker wins when both appear on the same line.
    return Ss2Parser if vertical_marker in marker_line else HorizParser
def Ss2Parser(contents):
    """Translate the third field of each six-field line into a state value.

    Lines that do not split into exactly six fields, or whose first field is
    '#', are ignored. The third field must be 'H', 'C' or 'E'.

    Args:
        contents: Iterable of lines.

    Returns:
        List of SecondaryStructureStates values (HELIX, COIL or SHEET), one
        per accepted line.

    Raises:
        InvalidFormat: If an accepted line carries another letter, or if no
            line is accepted.
    """
    state_by_code = {
        'H': SecondaryStructureStates.HELIX.value,
        'C': SecondaryStructureStates.COIL.value,
        'E': SecondaryStructureStates.SHEET.value,
    }
    output = []
    for raw_line in contents:
        fields = raw_line.split()
        if len(fields) != 6 or fields[0] == '#':
            continue
        code = fields[2]
        if code not in state_by_code:
            raise InvalidFormat(
                'Invalid secondary structure element {}'.format(code))
        output.append(state_by_code[code])
    if not output:
        raise InvalidFormat('Unable to parse prediction in psipred file')
    return output
def A3mParser(input, input_format=None):
    """Compute a rounded per-column residue count from an A3M alignment.

    extract_sequences (defined elsewhere) supplies the sequences, the column
    count and a divisor. Lowercase characters are skipped without advancing
    the column; every other character advances it, and those other than '-'
    add one to that column's count. Each count is then divided by the divisor
    and rounded to an int.

    Args:
        input: Full text of the file.
        input_format: Not used by this parser.

    Returns:
        List of ints, one per column.

    Raises:
        InvalidFormat: If extract_sequences returns no sequences, at most one
            sequence, a divisor below 0.1 or a column count below 1, or if a
            sequence has a residue beyond the expected number of columns.
    """
    sequences, seq_length, n_sequences = extract_sequences(input)
    unusable = (
        sequences is None
        or len(sequences) <= 1
        or n_sequences < 0.1
        or seq_length < 1
    )
    if unusable:
        raise InvalidFormat('Unable to parse contents of A3M MSA file')

    residue_count = [0 for _ in range(seq_length)]
    try:
        for sequence in sequences:
            # Lowercase characters do not occupy a column.
            aligned = (residue for residue in sequence if not residue.islower())
            for column, residue in enumerate(aligned):
                if residue != '-':
                    residue_count[column] += 1
    except IndexError:
        raise InvalidFormat('Unable to parse the A3M MSA file')

    return [int(round(count / n_sequences)) for count in residue_count]
def HorizParser(contents):
    """Translate the 'Pred: ' lines of a PSIPRED file into state values.

    For every line that contains 'Pred: ' and does not start with '#', each
    character of the last whitespace-separated field must be 'H', 'C' or 'E'.

    Args:
        contents: Iterable of lines.

    Returns:
        List of SecondaryStructureStates values (HELIX, COIL or SHEET), one
        per character read.

    Raises:
        InvalidFormat: If another character is met, or if nothing is read.
    """
    state_by_code = {
        'H': SecondaryStructureStates.HELIX.value,
        'C': SecondaryStructureStates.COIL.value,
        'E': SecondaryStructureStates.SHEET.value,
    }
    output = []
    for line in contents:
        if 'Pred: ' not in line or line[0] == '#':
            continue
        for residue in line.split()[-1]:
            if residue not in state_by_code:
                raise InvalidFormat(
                    'Invalid secondary structure element {}'.format(
                        residue))
            output.append(state_by_code[residue])
    if not output:
        raise InvalidFormat('Unable to parse prediction in psipred file')
    return output
def IupredParser(input, input_format=None):
    """Classify each data line as disordered or ordered from its third field.

    Lines with fewer than three whitespace-separated fields, or whose first
    field is exactly '#', are ignored. A score of 0.5 or more maps to
    DisorderStates.DISORDER, anything else to DisorderStates.ORDER.

    Args:
        input: Full text of the file.
        input_format: Not used by this parser.

    Returns:
        List of DisorderStates values, one per accepted line.

    Raises:
        InvalidFormat: If the third field of an accepted line is not a float,
            or if no line is accepted.
    """
    output = []
    for raw_line in input.split('\n'):
        fields = raw_line.split()
        if len(fields) < 3 or fields[0] == '#':
            continue
        try:
            score = float(fields[2])
        except ValueError:
            raise InvalidFormat('Invalid score field {}'.format(fields[2]))
        state = DisorderStates.DISORDER if score >= 0.5 else DisorderStates.ORDER
        output.append(state.value)
    if not output:
        raise InvalidFormat('Unable to parse prediction on iupred file')
    return output
def NpzParser(input, input_format=None):
    """Parse a base64-encoded NPZ archive holding a 'dist' array.

    ``input`` must consist of two comma-separated parts; the second part is
    base64-decoded and loaded with numpy, and the 'dist' entry is handed to
    parse_array (defined elsewhere).

    Args:
        input: String of the form '<prefix>,<base64 payload>'.
        input_format: Not used by this parser.

    Returns:
        The result of get_unique_distances (defined elsewhere) applied to the
        entries ((higher_residue, lower_residue), *remaining_items).

    Raises:
        InvalidFormat: If the input cannot be split, decoded, loaded or
            parsed, if no entry is produced, or if any entry after the first
            in the get_unique_distances result has index 3 above 9 or index 4
            above 1.
    """
    try:
        # ValueError covers an input without exactly one comma as well as
        # malformed base64 (binascii.Error is a ValueError subclass).
        content_type, content_string = input.split(',')
        decoded = base64.b64decode(content_string)
        # SECURITY(review): allow_pickle=True lets a crafted archive run
        # arbitrary code while being loaded. If 'dist' is always a plain
        # numeric array, switch to allow_pickle=False - confirm first.
        archive = np.load(io.BytesIO(decoded), allow_pickle=True)
        array = archive['dist']
        tmp_output = parse_array(array)
    except (OSError, KeyError, IndexError, ValueError) as error:
        raise InvalidFormat('Unable to parse distance NPZ file') from error

    output = []
    for contact in tmp_output:
        # Presumably contact = [res_1, res_2, raw_score, distance_bin,
        # distance_score]; the first two items are reordered in place so the
        # higher residue number comes first.
        contact[:2] = sorted(contact[:2], reverse=True)
        output.append((tuple(contact[:2]), *contact[2:]))

    if not output:
        raise InvalidFormat('Unable to parse NPZ file')

    unique_contacts = get_unique_distances(output)
    # The first element of the result is excluded from validation.
    if any(p[3] > 9 or p[4] > 1 for p in unique_contacts[1:]):
        raise InvalidFormat('Unable to parse trRosetta_NPZ file')
    return unique_contacts
def TopconsParser(input, input_format=None):
    """Translate the TOPCONS predicted topology string into state values.

    The topology is the line following the 'TOPCONS predicted topology:'
    header. Each of its characters must be 'i', 'o' or 'M'.

    Args:
        input: Full text of the file.
        input_format: Not used by this parser.

    Returns:
        List of MembraneStates values (INSIDE, OUTSIDE or INSERTED), one per
        character of the topology line.

    Raises:
        InvalidFormat: If the header is missing, nothing follows it, the
            topology line is empty, or it holds an unexpected character.
    """
    # Ignore trailing whitespace (including '\r' from Windows line endings)
    # so the header is still recognised.
    lines = [line.rstrip() for line in input.split('\n')]
    try:
        header_idx = lines.index('TOPCONS predicted topology:')
        # IndexError: the header is the last line of the file.
        topcons_prediction = lines[header_idx + 1].lstrip()
    except (ValueError, IndexError) as error:
        raise InvalidFormat(
            'Unable to parse prediction in topcons file') from error

    state_by_code = {
        'i': MembraneStates.INSIDE.value,
        'o': MembraneStates.OUTSIDE.value,
        'M': MembraneStates.INSERTED.value,
    }
    output = []
    for residue in topcons_prediction:
        if residue not in state_by_code:
            raise InvalidFormat('Invalid residue topology {}'.format(residue))
        output.append(state_by_code[residue])
    if not output:
        raise InvalidFormat('Unable to parse prediction in topcons file')
    return output
def MappredParser(input, input_format=None):
    """Parse contact lines carrying at least 36 whitespace-separated fields.

    Fields 0 and 1 are the residue numbers; everything from field 2 onwards
    is read as a list of probabilities. For pairs at least five positions
    apart, the raw score is the sum of the first nine probabilities. Ten bins
    are then formed: the first probability, eight sums of four consecutive
    probabilities starting at index 1, and the last probability. The largest
    bin and its position are recorded.

    Args:
        input: Full text of the file.
        input_format: Not used by this parser.

    Returns:
        The result of get_unique_distances (defined elsewhere) applied to the
        entries ((higher_residue, lower_residue), raw_score, best_bin,
        best_probability).

    Raises:
        InvalidFormat: If no contact is found, or if any entry after the
            first in the get_unique_distances result has index 3 above 9 or
            index 4 above 1.
    """
    minimum_fields = 36
    parsed = []
    for raw_line in input.split('\n'):
        fields = raw_line.strip().split()
        if len(fields) < minimum_fields:
            continue
        if not (fields[0].isdigit() and fields[1].isdigit()):
            continue
        first, second = int(fields[0]), int(fields[1])
        if abs(first - second) < 5:
            continue

        probabilities = [float(value) for value in fields[2:]]
        raw_score = sum(probabilities[:9])

        binned = [probabilities[0]]
        for start in range(1, 30, 4):
            binned.append(sum(probabilities[start:start + 4]))
        binned.append(probabilities[-1])

        best_probability = max(binned)
        best_bin = binned.index(best_probability)
        pair = (max(first, second), min(first, second))
        parsed.append((pair, raw_score, best_bin, best_probability))

    if not parsed:
        raise InvalidFormat('Unable to parse MapPred file')

    unique_contacts = get_unique_distances(parsed)
    # The first element of the result is excluded from validation.
    out_of_range = [
        entry for entry in unique_contacts[1:]
        if entry[3] > 9 or entry[4] > 1
    ]
    if any(out_of_range):
        raise InvalidFormat('Unable to parse MapPred file')
    return unique_contacts
def ComsatParser(input):
    """Parse contact lines whose first and third fields are residue numbers.

    A line is used when it has at least five whitespace-separated fields and
    its first and third fields are digits. Pairs closer than five positions
    apart are dropped. Every contact gets a score of 0.

    Args:
        input: Full text of the file.

    Returns:
        List of (res_1, res_2, 0) tuples in file order.

    Raises:
        InvalidFormat: If no contact is found.
    """
    output = []
    for raw_line in input.split('\n'):
        fields = raw_line.split()
        # Check the field count before indexing: a short line that starts
        # with a digit must be skipped, not crash with IndexError.
        if len(fields) < 5:
            continue
        if not (fields[0].isdigit() and fields[2].isdigit()):
            continue
        res_1, res_2 = int(fields[0]), int(fields[2])
        if abs(res_1 - res_2) >= 5:
            output.append((res_1, res_2, 0))
    if not output:
        raise InvalidFormat('Unable to parse contacts')
    # All scores are 0, so sorting by score would keep file order; return the
    # list as built.
    return output
def CCMpredParser(input):
    """Parse a whitespace-separated score matrix into scored contacts.

    Blank lines, lines with a single field and lines whose first field is
    alphabetic are skipped; the remaining lines are the matrix rows, numbered
    from 1, with fields numbered from 1 as columns. A contact is kept when
    row and column are at least five apart and the score is above 0.

    Args:
        input: Full text of the file.

    Returns:
        List of (row, column, score) tuples sorted by score, highest first.

    Raises:
        InvalidFormat: If no contact is found.
    """
    rows = []
    for raw_line in input.split('\n'):
        fields = raw_line.split()
        if fields and not fields[0].isalpha() and len(fields) != 1:
            rows.append(fields)

    contacts = []
    for row_number, row in enumerate(rows, 1):
        for column_number, field in enumerate(row, 1):
            if abs(row_number - column_number) < 5:
                continue
            value = float(field)
            if value > 0:
                contacts.append((row_number, column_number, value))

    if not contacts:
        raise InvalidFormat('Unable to parse contacts')
    contacts.sort(key=itemgetter(2), reverse=True)
    return contacts