Example #1
0
def ConsurfParser(input, input_format=None):
    """Extract per-residue conservation scores from ConSurf text output.

    A line contributes a score when it has at least four whitespace
    separated fields, its first field is numeric and the first character
    of its fourth field is numeric. Only that single leading character of
    the fourth field is converted to the score.

    Args:
        input: Full text contents of the file.
        input_format: Unused; kept for a uniform parser signature.

    Returns:
        List of integer scores, in file order.

    Raises:
        InvalidFormat: If a score above 9 is read or no score is found.
    """
    scores = []

    for raw_line in input.split('\n'):
        fields = raw_line.lstrip().split()

        # Too short to carry a score column.
        if len(fields) < 4:
            continue

        grade_char = fields[3][0]
        if not (fields[0].isnumeric() and grade_char.isnumeric()):
            continue

        score = int(grade_char)
        if score > 9:
            raise InvalidFormat(
                'Unable to parse prediction on consurf file: score above 9 detected!'
            )
        scores.append(score)

    if scores:
        return scores
    raise InvalidFormat('Unable to parse prediction on consurf file')
Example #2
0
def CCMpredParser(input, input_format=None):
    """Parse a CCMpred score matrix into a list of unique contacts.

    Every retained line is treated as one row of the matrix: the row
    number is the first residue and the column number is the second
    residue. Blank lines, lines whose first token is alphabetic, lines
    with a single token and lines whose first token contains '#' are
    skipped and do NOT advance the row counter, so headers or comments
    cannot shift the residue numbering.

    Args:
        input: Full text contents of the file.
        input_format: Unused; kept for a uniform parser signature.

    Returns:
        Whatever get_unique_contacts returns for the collected
        ((res_high, res_low), raw_score) records.

    Raises:
        InvalidFormat: If a matrix cell is not a number or no contact
            with score >= 0.1 and sequence separation >= 5 is found.
    """
    output = []
    res_1 = 0

    for line in input.split('\n'):
        fields = line.lstrip().split()
        if not fields or fields[0].isalpha() or len(fields) == 1 or '#' in fields[0]:
            continue

        # Count only real matrix rows so the index matches the residue.
        res_1 += 1

        for res_2, raw_score in enumerate(fields, 1):
            try:
                raw_score = float(raw_score)
            except ValueError as err:
                raise InvalidFormat('Unable to parse contacts') from err

            if raw_score < 0.1:
                continue

            if abs(res_1 - res_2) >= 5:
                # Store the residue pair with the larger index first.
                pair = (max(res_1, res_2), min(res_1, res_2))
                output.append((pair, raw_score))

    if not output:
        raise InvalidFormat('Unable to parse contacts')

    return get_unique_contacts(output)
Example #3
0
def CASPRR2Parser(input, input_format=None):
    """Parse contact lines made of exactly 13 whitespace separated fields.

    Fields 0 and 1 are the residue numbers, field 2 is the raw score and
    the remaining ten fields are per-bin distance probabilities. Only
    pairs at least five residues apart are kept. For each kept pair the
    highest probability and the index of its bin are recorded.

    Args:
        input: Full text contents of the file.
        input_format: Unused; kept for a uniform parser signature.

    Returns:
        Whatever get_unique_distances returns for the collected
        ((res_high, res_low), raw_score, distance_bin, distance_score)
        records.

    Raises:
        InvalidFormat: If no record is found, or if the validation of the
            de-duplicated records (index 3 > 9 or index 4 > 1) fails.
    """
    expected_fields = 13
    records = []

    for raw_line in input.split('\n'):
        fields = raw_line.strip().split()

        if len(fields) != expected_fields:
            continue
        if not (fields[0].isdigit() and fields[1].isdigit()):
            continue

        first, second = int(fields[0]), int(fields[1])
        if abs(first - second) < 5:
            continue

        raw_score = float(fields[2])
        probabilities = [float(value) for value in fields[3:]]
        best_probability = max(probabilities)
        best_bin = probabilities.index(best_probability)

        # Residue pair is stored with the larger index first.
        pair = (max(first, second), min(first, second))
        records.append((pair, raw_score, best_bin, best_probability))

    if not records:
        raise InvalidFormat('Unable to parse CASPRR_MODE_2 file')

    unique_contacts = get_unique_distances(records)
    # First element is skipped by the check, as in every distance parser here.
    if any([p for p in unique_contacts[1:] if p[3] > 9 or p[4] > 1]):
        raise InvalidFormat('Unable to parse CASPRR_MODE_2 file')
    return unique_contacts
Example #4
0
def PDBParser(input, input_format=None):
    """Derive contacts from the first chain of a PDB structure.

    The text is loaded with BioPDBParser, the first chain is cleaned with
    remove_atoms and its contacts are computed by get_chain_contacts.

    Args:
        input: Full text contents of the PDB file.
        input_format: Unused; kept for a uniform parser signature.

    Returns:
        A list whose first element is the string "PDB", followed by the
        contacts sorted by their third element in descending order.

    Raises:
        InvalidFormat: If any step of the parsing fails or no contacts
            are produced.
    """
    try:
        structure = BioPDBParser().get_structure('pdb', io.StringIO(input))
        chain = list(structure.get_chains())[0]
        remove_atoms(chain)
        contacts = get_chain_contacts(chain)
    except Exception as err:
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit are not converted into a format error.
        raise InvalidFormat('Unable to parse contacts') from err

    if not contacts:
        raise InvalidFormat('Unable to parse contacts')

    return ["PDB"] + sorted(contacts, key=itemgetter(2), reverse=True)
Example #5
0
def ContactParser(input, input_format):
    """Parse a delimited contact file whose layout depends on input_format.

    The column positions, minimum number of fields and field separator
    are looked up by name in the *ContactFormats classes (presumably Enum
    classes, since their ``.value`` is read - confirm against their
    definitions). Members are resolved with ``getattr`` rather than by
    calling ``__getattr__`` directly, because ``EnumMeta.__getattr__`` was
    removed in Python 3.12.

    Args:
        input: Full text contents of the file.
        input_format: Name of the member to look up in each format class.

    Returns:
        Whatever get_unique_contacts returns for the collected
        ((res_high, res_low), raw_score) records. The raw score is 0 when
        the format defines no score column.

    Raises:
        InvalidFormat: If no contact with sequence separation >= 5 is
            found.
    """
    res_1_idx = getattr(FieldResidueOneContactFormats, input_format).value
    res_2_idx = getattr(FieldResidueTwoContactFormats, input_format).value
    raw_score_idx = getattr(FieldRawScoreContactFormats, input_format).value
    line_size = getattr(LineSizeContactFormats, input_format).value
    regex = getattr(FieldSeparatorContactFormats, input_format).value

    output = []

    for line in input.split('\n'):
        fields = re.split(regex, line.strip())

        if (len(fields) < line_size
                or not fields[res_1_idx].isdigit()
                or not fields[res_2_idx].isdigit()):
            continue

        res_1 = int(fields[res_1_idx])
        res_2 = int(fields[res_2_idx])

        if abs(res_1 - res_2) >= 5:
            if raw_score_idx is not None:
                raw_score = float(fields[raw_score_idx])
            else:
                raw_score = 0
            # Store the residue pair with the larger index first.
            pair = (max(res_1, res_2), min(res_1, res_2))
            output.append((pair, raw_score))

    if not output:
        raise InvalidFormat('Unable to parse contacts')

    return get_unique_contacts(output)
Example #6
0
def guess_psipred_format(contents):
    """Pick the PSIPRED parser that matches the format marker in contents.

    Lines are scanned in order; the first line containing a known marker
    decides the result. Within one line the VFORMAT marker is tested
    before the HFORMAT marker.

    Args:
        contents: Iterable of text lines.

    Returns:
        Ss2Parser for '# PSIPRED VFORMAT', HorizParser for
        '# PSIPRED HFORMAT'.

    Raises:
        InvalidFormat: If no line contains either marker.
    """
    marker_to_parser = (
        ('# PSIPRED VFORMAT', Ss2Parser),
        ('# PSIPRED HFORMAT', HorizParser),
    )

    for line in contents:
        for marker, parser in marker_to_parser:
            if marker in line:
                return parser

    raise InvalidFormat('Unable to guess psipred file format')
Example #7
0
def Ss2Parser(contents):
    """Translate six-column PSIPRED lines into secondary structure states.

    Only lines with exactly six whitespace separated fields whose first
    field is not '#' are used; the third field ('H', 'C' or 'E') selects
    the state.

    Args:
        contents: Iterable of text lines.

    Returns:
        List of SecondaryStructureStates values, one per used line.

    Raises:
        InvalidFormat: If a used line carries an unknown element, or if no
            line could be used.
    """
    state_by_code = {
        'H': SecondaryStructureStates.HELIX.value,
        'C': SecondaryStructureStates.COIL.value,
        'E': SecondaryStructureStates.SHEET.value,
    }
    states = []

    for raw_line in contents:
        fields = raw_line.split()
        if len(fields) != 6 or fields[0] == '#':
            continue

        code = fields[2]
        if code not in state_by_code:
            raise InvalidFormat(
                'Invalid secondary structure element {}'.format(code))
        states.append(state_by_code[code])

    if states:
        return states
    raise InvalidFormat('Unable to parse prediction in psipred file')
Example #8
0
def A3mParser(input, input_format=None):
    """Compute the rounded per-column residue count ratio of an A3M file.

    Sequences come from extract_sequences. Lowercase characters are
    ignored and do not advance the column; every other character advances
    the column and, unless it is '-', adds one to that column's count.
    Each count is then divided by n_sequences and rounded to an int.

    Args:
        input: Full text contents of the file.
        input_format: Unused; kept for a uniform parser signature.

    Returns:
        List of ints, one per column.

    Raises:
        InvalidFormat: If extract_sequences yields unusable data, or a
            residue falls outside the expected number of columns.
    """
    sequences, seq_lenght, n_sequences = extract_sequences(input)

    unusable = (sequences is None
                or len(sequences) <= 1
                or n_sequences < 0.1
                or seq_lenght < 1)
    if unusable:
        raise InvalidFormat('Unable to parse contents of A3M MSA file')

    column_counts = [0] * seq_lenght
    try:
        for sequence in sequences:
            # Lowercase characters do not occupy a column.
            aligned = (char for char in sequence if not char.islower())
            for column, char in enumerate(aligned):
                if char != '-':
                    column_counts[column] += 1
    except IndexError:
        raise InvalidFormat('Unable to parse the A3M MSA file')

    return [int(round(count / n_sequences)) for count in column_counts]
Example #9
0
def HorizParser(contents):
    """Translate 'Pred: ' lines of PSIPRED output into structure states.

    For every line that contains 'Pred: ' and does not start with '#',
    the last whitespace separated token is read character by character
    ('H', 'C' or 'E').

    Args:
        contents: Iterable of text lines.

    Returns:
        List of SecondaryStructureStates values, one per character.

    Raises:
        InvalidFormat: If an unknown character is met, or if no state was
            collected.
    """
    states = []

    for raw_line in contents:
        if 'Pred: ' not in raw_line or raw_line[0] == '#':
            continue

        for code in raw_line.split()[-1]:
            if code == 'H':
                state = SecondaryStructureStates.HELIX.value
            elif code == 'C':
                state = SecondaryStructureStates.COIL.value
            elif code == 'E':
                state = SecondaryStructureStates.SHEET.value
            else:
                raise InvalidFormat(
                    'Invalid secondary structure element {}'.format(
                        code))
            states.append(state)

    if states:
        return states
    raise InvalidFormat('Unable to parse prediction in psipred file')
Example #10
0
def IupredParser(input, input_format=None):
    """Classify residues as disordered or ordered from IUPred text output.

    Lines with fewer than three whitespace separated fields, or whose
    first field is '#', are ignored. The third field is the score: values
    of 0.5 or more map to DISORDER, everything else to ORDER.

    Args:
        input: Full text contents of the file.
        input_format: Unused; kept for a uniform parser signature.

    Returns:
        List of DisorderStates values, one per used line.

    Raises:
        InvalidFormat: If a score field is not a number, or if no line
            could be used.
    """
    states = []

    for raw_line in input.split('\n'):
        fields = raw_line.lstrip().split()
        if len(fields) < 3 or fields[0] == '#':
            continue

        try:
            score = float(fields[2])
        except ValueError:
            raise InvalidFormat('Invalid score field {}'.format(fields[2]))

        states.append(
            DisorderStates.DISORDER.value if score >= 0.5
            else DisorderStates.ORDER.value
        )

    if states:
        return states
    raise InvalidFormat('Unable to parse prediction on iupred file')
Example #11
0
def NpzParser(input, input_format=None):
    """Parse a base64 encoded NPZ archive holding a 'dist' array.

    The input is expected to be '<content type>,<base64 payload>'. The
    payload is decoded, loaded with numpy and its 'dist' entry is handed
    to parse_array.

    Args:
        input: String with exactly one comma separating the content type
            from the base64 payload.
        input_format: Unused; kept for a uniform parser signature.

    Returns:
        Whatever get_unique_distances returns for the collected
        ((res_high, res_low), ...) records.

    Raises:
        InvalidFormat: If the input cannot be split, decoded or loaded,
            if no record is produced, or if the validation of the
            de-duplicated records (index 3 > 9 or index 4 > 1) fails.
    """
    # Local import: only the exception class is needed here.
    from pickle import UnpicklingError

    try:
        # ValueError: wrong number of commas, or invalid base64
        # (binascii.Error is a ValueError subclass).
        _, content_string = input.split(',')
        decoded = base64.b64decode(content_string)
        # SECURITY NOTE(review): allow_pickle=True lets numpy unpickle
        # arbitrary objects, which can execute code if the payload is
        # untrusted. Confirm whether 'dist' ever needs object arrays;
        # if not, this should become allow_pickle=False.
        archive = np.load(io.BytesIO(decoded), allow_pickle=True)
        array = archive['dist']
        tmp_output = parse_array(array)
    except (OSError, KeyError, IndexError, ValueError, UnpicklingError) as err:
        raise InvalidFormat('Unable to parse distance NPZ file') from err

    output = []
    for contact in tmp_output:
        # contact = [res_1, res_2, raw_score, distance_bin, distance_score]
        contact[:2] = sorted(contact[:2], reverse=True)
        output.append((tuple(contact[:2]), *contact[2:]))

    if not output:
        raise InvalidFormat('Unable to parse NPZ file')

    unique_contacts = get_unique_distances(output)
    if any([p for p in unique_contacts[1:] if p[3] > 9 or p[4] > 1]):
        raise InvalidFormat('Unable to parse trRosetta_NPZ file')
    return unique_contacts
Example #12
0
def TopconsParser(input, input_format=None):
    """Read the TOPCONS topology string and map it to membrane states.

    The line that follows the exact line 'TOPCONS predicted topology:' is
    taken as the prediction. Each of its characters must be 'i', 'o' or
    'M'.

    Args:
        input: Full text contents of the file.
        input_format: Unused; kept for a uniform parser signature.

    Returns:
        List of MembraneStates values, one per character.

    Raises:
        InvalidFormat: If the header line is missing, nothing follows it,
            the prediction is empty, or an unknown character is met.
    """
    contents = input.split('\n')

    try:
        header_idx = contents.index('TOPCONS predicted topology:')
        # IndexError here means the header was the last line of the file.
        topcons_prediction = contents[header_idx + 1].strip()
    except (ValueError, IndexError) as err:
        raise InvalidFormat('Unable to parse prediction in topcons file') from err

    output = []
    for residue in topcons_prediction:
        if residue == 'i':
            output.append(MembraneStates.INSIDE.value)
        elif residue == 'o':
            output.append(MembraneStates.OUTSIDE.value)
        elif residue == 'M':
            output.append(MembraneStates.INSERTED.value)
        else:
            raise InvalidFormat('Invalid residue topology {}'.format(residue))

    if not output:
        raise InvalidFormat('Unable to parse prediction in topcons file')

    return output
Example #13
0
def MappredParser(input, input_format=None):
    """Parse contact lines with at least 36 whitespace separated fields.

    Fields 0 and 1 are the residue numbers and every following field is a
    probability. Only pairs at least five residues apart are kept. For a
    kept pair:

    * the raw score is the sum of the first nine probabilities;
    * ten bins are built from the first probability, eight sums of four
      consecutive probabilities (starting at positions 1, 5, ..., 29) and
      the last probability;
    * the highest bin value and its index are recorded.

    Args:
        input: Full text contents of the file.
        input_format: Unused; kept for a uniform parser signature.

    Returns:
        Whatever get_unique_distances returns for the collected
        ((res_high, res_low), raw_score, distance_bin, distance_score)
        records.

    Raises:
        InvalidFormat: If no record is found, or if the validation of the
            de-duplicated records (index 3 > 9 or index 4 > 1) fails.
    """
    minimum_fields = 36
    records = []

    for raw_line in input.split('\n'):
        fields = raw_line.strip().split()

        if len(fields) < minimum_fields:
            continue
        if not (fields[0].isdigit() and fields[1].isdigit()):
            continue

        first, second = int(fields[0]), int(fields[1])
        if abs(first - second) < 5:
            continue

        probabilities = [float(value) for value in fields[2:]]
        raw_score = sum(probabilities[:9])

        bins = [probabilities[0]]
        for start in range(1, 30, 4):
            bins.append(sum(probabilities[start:start + 4]))
        bins.append(probabilities[-1])

        best_value = max(bins)
        best_bin = bins.index(best_value)

        # Residue pair is stored with the larger index first.
        pair = (max(first, second), min(first, second))
        records.append((pair, raw_score, best_bin, best_value))

    if not records:
        raise InvalidFormat('Unable to parse MapPred file')

    unique_contacts = get_unique_distances(records)
    if any([p for p in unique_contacts[1:] if p[3] > 9 or p[4] > 1]):
        raise InvalidFormat('Unable to parse MapPred file')
    return unique_contacts
Example #14
0
def ComsatParser(input):
    """Parse contact lines where fields 0 and 2 are residue numbers.

    A line is used when it has at least five whitespace separated fields
    and both field 0 and field 2 consist of digits. The field count is
    checked before any field is indexed, so short numeric lines are
    skipped instead of raising IndexError. Only pairs at least five
    residues apart are kept, each with a score of 0.

    Args:
        input: Full text contents of the file.

    Returns:
        List of (res_1, res_2, 0) tuples in file order.

    Raises:
        InvalidFormat: If no contact is found.
    """
    output = []

    for line in input.split('\n'):
        fields = line.lstrip().split()

        # Length first: fields[2] must exist before it is inspected.
        if len(fields) < 5 or fields[0].isalpha():
            continue
        if not (fields[0].isdigit() and fields[2].isdigit()):
            continue

        res_1, res_2 = int(fields[0]), int(fields[2])
        if abs(res_1 - res_2) >= 5:
            output.append((res_1, res_2, 0))

    if not output:
        raise InvalidFormat('Unable to parse contacts')

    # All scores are 0, so this stable sort keeps file order.
    return sorted(output, key=itemgetter(2), reverse=True)
Example #15
0
def CCMpredParser(input):
    """Parse a CCMpred score matrix into a list of scored contacts.

    Blank lines, lines whose first token is alphabetic and lines with a
    single token are dropped first. The remaining lines are numbered from
    1 and form the matrix rows; columns are numbered from 1 as well. A
    cell becomes a contact when row and column are at least five apart
    and its value is greater than 0.

    Args:
        input: Full text contents of the file.

    Returns:
        List of (row, column, score) tuples sorted by score, highest
        first.

    Raises:
        InvalidFormat: If no contact is found.
    """
    rows = []
    for raw_line in input.split('\n'):
        tokens = raw_line.lstrip().split()
        if len(tokens) > 1 and not tokens[0].isalpha():
            rows.append(tokens)

    contacts = []
    for row_number, tokens in enumerate(rows, 1):
        for column_number, token in enumerate(tokens, 1):
            if abs(row_number - column_number) < 5:
                continue
            value = float(token)
            if value > 0:
                contacts.append((row_number, column_number, value))

    if not contacts:
        raise InvalidFormat('Unable to parse contacts')

    contacts.sort(key=itemgetter(2), reverse=True)
    return contacts