Exemple #1
0
def init_aligner(allow_target_gaps=False, allow_target_mismatches=False):
    """Create an aligner that forbids target gaps and favours query end gaps.

    The weights penalize excessive gaps, make gaps in the ProteinNet (target)
    sequence impossible unless explicitly allowed, and prefer gaps at the
    tail ends of sequences.

    Args:
        allow_target_gaps: When false, ``target_gap_score`` is set to ``-inf``
            and the match score is 10. When true, the target gap score is
            left at the aligner's default and the match score is 200.
        allow_target_mismatches: When true, mismatches score 200 instead of
            ``-inf``.

    Returns:
        The configured ``Align.PairwiseAligner``.
    """
    a = Align.PairwiseAligner()

    # Use the documented ``*_score`` attribute names; the shorter legacy
    # ``match``/``mismatch`` spellings are not accepted by current Biopython.
    a.mismatch_score = 200 if allow_target_mismatches else -np.inf

    if allow_target_gaps:
        a.match_score = 200
    else:
        # Don't allow for gaps in the target sequence, and do not let
        # matching items overwhelm determining where gaps should go.
        a.target_gap_score = -np.inf
        a.match_score = 10

    # Generally, prefer to extend gaps than to create them
    a.query_extend_gap_score = 99
    a.query_open_gap_score = 49

    # Set slight preference for open gaps on the edges, however, if present,
    # strongly prefer single edge gaps. These must come after the general
    # query gap scores above so the end-gap values are the ones that stick.
    a.query_end_open_gap_score = 50
    a.query_end_extend_gap_score = 100

    return a
def matrix(seq, file_name, idslist):
    '''
      Write a pairwise similarity matrix of the sequences to an Excel file.

      Each pair is scored with a default ``Align.PairwiseAligner`` and the
      score is divided by the length of the row sequence. The first score
      computed for a pair of ids is reused for the mirrored cell, so the
      sheet is symmetric and each unordered pair is aligned only once.

      seq       -- sequences, indexable by position
      file_name -- output path without the ``.xlsx`` extension
      idslist   -- gene ids parallel to ``seq``; used as row/column headers
    '''
    cache = {}
    work_book = xlsxwriter.Workbook(file_name + '.xlsx')
    work_sheet = work_book.add_worksheet()
    align = Align.PairwiseAligner()

    work_sheet.write(0, 0, 'Gene Id')

    for s in range(len(seq)):
        # Header cells: column s + 1 of the first row, row s + 1 of the
        # first column.
        work_sheet.write(0, s + 1, idslist[s])
        work_sheet.write(s + 1, 0, idslist[s])

        for i in range(len(seq)):
            key = (idslist[s], idslist[i])
            if key in cache:
                score = cache[key]
            else:
                # Only align on a cache miss; store under the mirrored key so
                # the transposed cell finds it later.
                score = align.score(seq[s], seq[i]) / len(seq[s])
                cache[(idslist[i], idslist[s])] = score
            work_sheet.write(s + 1, i + 1, score)

    work_book.close()
Exemple #3
0
    def pairwise_aligner(self):
        """Align ``self.sequence1`` with ``self.sequence2`` and store the result.

        Sets ``self.score`` to the alignment score and ``self.differences``
        to the smallest count of "X" plus "-" symbols found in the symbolic
        summary of any optimal alignment.
        """
        aligner = Align.PairwiseAligner()
        aligner.open_gap_score = -10
        aligner.extend_gap_score = -0.5
        aligner.target_end_gap_score = 0.0
        aligner.query_end_gap_score = 0.0

        differences_list = []
        for candidate in sorted(aligner.align(self.sequence1, self.sequence2)):
            pieces = str(candidate).split()
            assert len(
                pieces) % 3 == 0, "should be divisible by three"
            # The printed alignment splits into first sequence, symbolic
            # summary, second sequence; only the summary is inspected, so
            # "X" residues inside the sequences themselves are not counted.
            summary = pieces[1]
            differences_list.append(summary.count("X") + summary.count("-"))

        self.score = aligner.score(self.sequence1, self.sequence2)
        self.differences = min(differences_list)
Exemple #4
0
    def __init__(self, align_cfg: Dict, min_score: float, min_read_length_without_primers: int,
                 window_size: int):
        """Store the thresholds and build the Biopython aligner from ``align_cfg``.

        Args:
            align_cfg: Aligner settings. Must contain the keys "match_score",
                "mismatch_score", "open_gap_score", "extend_gap_score" and
                "substitution_matrix" (an empty string means no matrix).
            min_score: Stored as ``self._min_score``.
            min_read_length_without_primers: Stored as
                ``self._min_primer_dimer_thresh``.
            window_size: Stored as ``self._window_size``.

        Raises:
            AlignerSubstitutionDoesntExist: If a non-empty substitution
                matrix name is not an attribute of ``MatrixInfo``.
        """
        self._min_score = min_score
        self._window_size = window_size

        # Set logger
        self._logger = LoggerWrapper.get_logger()

        self._min_primer_dimer_thresh = min_read_length_without_primers

        # Init biopython aligner
        self._aligner = Align.PairwiseAligner()
        self._aligner.match_score = align_cfg["match_score"]
        self._aligner.mismatch_score = align_cfg["mismatch_score"]
        self._aligner.open_gap_score = align_cfg["open_gap_score"]
        self._aligner.extend_gap_score = align_cfg["extend_gap_score"]

        matrix_name = align_cfg["substitution_matrix"]
        if matrix_name != "":
            if matrix_name not in MatrixInfo.__dict__:
                raise AlignerSubstitutionDoesntExist(matrix_name)
            self._aligner.substitution_matrix = MatrixInfo.__dict__[matrix_name]
            # The trailing space keeps the two concatenated literals from
            # running together in the logged message.
            self._logger.warning("Shifting indels to cut-site isn't available for alignment with difference score "
                                 "substitution matrix. Contact package owner if this feature is required")
Exemple #5
0
    def _align_clusters(config, one, two, cutoff=0.3):
        """Constructs a cluster alignment using the given configuration.

        Args:
            config: Mapping of aligner attribute names to values. An optional
                "substitution_matrix" entry names the matrix to load
                (BLOSUM62 when absent or not recognised). The mapping itself
                is not modified.
            one: Query cluster; its ``loci`` and their ``genes`` are iterated.
            two: Target cluster, iterated the same way.
            cutoff: Minimum identity for a gene pair to be linked.

        Returns:
            An ``Alignment`` holding a link for every gene pair whose identity
            reaches ``cutoff``.
        """
        LOG.info("%s vs %s", one.name, two.name)

        # Work on a copy: popping from the caller's dict would make every
        # later call with the same config fall back to BLOSUM62.
        config = dict(config)

        aligner = Align.PairwiseAligner()
        matrix = config.pop("substitution_matrix", "BLOSUM62")
        if matrix not in substitution_matrices.load():
            LOG.warning(
                "Invalid substitution matrix (%s), defaulting to BLOSUM62",
                matrix)
            matrix = "BLOSUM62"
        aligner.substitution_matrix = substitution_matrices.load(matrix)
        for k, v in config.items():
            setattr(aligner, k, v)

        alignment = Alignment(query=one, target=two)
        for locusA, locusB in product(one.loci, two.loci):
            for geneA, geneB in product(locusA.genes, locusB.genes):
                # Genes without a translation cannot be aligned.
                if not geneA.translation or not geneB.translation:
                    continue
                aln = aligner.align(geneA.translation, geneB.translation)
                identity, similarity = compute_identity(aln[0])
                if identity < cutoff:
                    continue
                alignment.add_link(geneA, geneB, identity, similarity)
        return alignment
Exemple #6
0
    def test_results(self):
        """
        Compare the SW score with Biopython's local alignment score for
        every case in ``self.data``.
        """
        for case in self.data:
            seq_a, seq_b = case['seq_a'], case['seq_b']

            # Implementation under test.
            sw = SW(seq_a, seq_b,
                    case['match'], case['mismatch'], case['gap'])
            sw.initialize()
            sw.calculate_score()
            sw.traceback()
            sw.calculate_identity()
            score = sw.get_score()

            # Reference: Biopython local alignment with the same scoring.
            reference = Align.PairwiseAligner()
            reference.mode = 'local'
            reference.match_score = case['match']
            reference.mismatch_score = case['mismatch']
            reference.gap_score = case['gap']
            ref_score = reference.align(seq_a, seq_b).score

            self.assertEqual(score, ref_score, case)
Exemple #7
0
 def find_orf_best_match(self, protein_seq_dict=None, min_score_thr=-1):
     """Align the ORF translation against known proteins and name the best hit.

     A trailing stop codon ("*") is removed from ``self.translation`` before
     alignment.

     Args:
         protein_seq_dict (dict): Dictionary of id:sequence of known proteins;
             each value must expose the raw sequence as ``.seq``.
         min_score_thr (int): Minimum alignment score to consider a hit.

     Returns:
         The id of the best-scoring protein, suffixed with the two values
         returned by ``utils.percent_identity`` formatted as
         ``"(x.xx%/y.yy%)"``, or "Unknown" when the best score does not
         exceed ``min_score_thr``.

     Raises:
         TypeError: If ``protein_seq_dict`` is not a dictionary.
     """
     if not isinstance(protein_seq_dict, dict):
         raise TypeError(
             "protein_seq_dict must be a dictionary of id:sequence")
     aligner = Align.PairwiseAligner()
     aligner.open_gap_score = -10
     aligner.extend_gap_score = -10
     aligner.target_end_gap_score = -0.1
     aligner.query_end_gap_score = -0.1
     aligner.match_score = 2
     aligner.mismatch_score = -1

     seq_to_match = self.translation
     # Remove stop codon if present
     if (seq_to_match[-1] == "*"):
         seq_to_match = seq_to_match[:-1]

     best_label = "NA"
     best_score = -1
     for seq_id, seq in protein_seq_dict.items():
         # min() selects the same alignment as sorted(...)[0] without
         # sorting every optimal alignment.
         aln = min(aligner.align(seq_to_match, seq.seq))
         if aln.score > best_score:
             # Identity is only needed for the label of a new best hit.
             identity = utils.percent_identity(aln)
             best_label = "{}({:.2f}%/{:.2f}%)".format(
                 seq_id, identity[0], identity[1])
             best_score = aln.score

     if best_score > min_score_thr:
         return best_label
     return "Unknown"
Exemple #8
0
def getAlignScore(line):
    """Score a barcode/UMI tag against candidate sequences.

    ``line[0]`` is a "<barcode>_<umi>" string and ``line[1]`` an iterable of
    candidate sequences. The joined tag is aligned to every candidate; the
    best-scoring candidate is then used to score the barcode alone.

    Returns a list of four strings: tag score, barcode score, their
    difference (reported as the UMI score) and the strand flag, which is the
    parity of the best candidate's index.
    """
    tag = line[0]
    candidates = line[1]

    aligner = Align.PairwiseAligner()
    aligner.match_score = 1
    aligner.mismatch_score = -1
    aligner.gap_score = -2
    aligner.query_gap_score = -1
    aligner.target_end_gap_score = 0

    tag_parts = tag.split("_")
    barcode = tag_parts[0]
    umi = tag_parts[1]  # unused, but indexing requires a UMI part to exist
    joined_tag = "".join(tag_parts)

    results = [aligner.align(joined_tag, candidate) for candidate in candidates]
    scores = [result.score for result in results]
    tag_score = max(scores)
    best_index = scores.index(tag_score)
    strand = best_index % 2

    best = results[best_index][0]
    # Slice the candidate from the start of its first aligned block to the
    # end of its last aligned block.
    query_blocks = best.aligned[1]
    matched_region = best.query[query_blocks[0][0]:query_blocks[-1][-1]]

    barcode_score = aligner.align(barcode, matched_region).score
    umi_score = tag_score - barcode_score
    return list(map(str, (tag_score, barcode_score, umi_score, strand)))
Exemple #9
0
def matchAndScore(neo, db):
    """
    Find peptides in unmutated DB that match at P4,5,8 using regex,
    then keep those whose scoring-matrix score is >= 1.0 for P6+P7
    combined and >= 0.25 for P6 and for P7 individually.

    :param neo: neoepitope string input (at least 8 characters)
    :param db: unmutated db as a pandas Series of peptide strings
    :return: Matching peptides with neoepitope as pandas DF
    """
    import re

    # Non-capturing character classes avoid the pandas "pattern has match
    # groups" warning; escaping keeps the neoepitope residues literal.
    pattern = ('^[A-Z]{3}' + re.escape(neo[3]) + re.escape(neo[4])
               + '[A-Z]{2}' + re.escape(neo[7]))
    # get pos 4, 5, 8 matches
    pos_matches = db[db.str.contains(pattern, regex=True)].tolist()

    aligner = Align.PairwiseAligner()
    aligner.substitution_matrix = subMatrix()
    unmut_matches = []
    for match in pos_matches:
        score6 = aligner.score(neo[5], match[5])
        score7 = aligner.score(neo[6], match[6])
        score67 = aligner.score(neo[5:7], match[5:7])  # compare P6 and P7

        if score67 >= 1.0 and score6 >= 0.25 and score7 >= 0.25:
            unmut_matches.append(match)

    scoredDF = pd.DataFrame()
    scoredDF['Peptide'] = unmut_matches
    scoredDF['Neoepitope'] = neo
    scoredDF['P4,5,8'] = neo[3] + neo[4] + neo[7]
    return scoredDF
Exemple #10
0
def map_seqs(obj, ref, segid_obj=None, segid_ref=None, matrix='BLOSUM62'):
    """
    Align two sequences and map aligned positions of obj onto ref.

    Only the first alignments returned by the aligner are inspected, and an
    alignment is used only if its score is greater than zero. When none
    qualifies the mapping is empty.

    Returns {obj_pos: ref_pos} with 0-based positions when segid_obj is None,
    otherwise {(segid_obj, obj_pos): (segid_ref, ref_pos)}.
    """
    aligner = Align.PairwiseAligner()
    aligner.substitution_matrix = substitution_matrices.load(matrix)

    best_score = 0
    best_aln = None
    for index, aln in enumerate(aligner.align(str(obj), str(ref))):
        if index > 100:  # bound the number of alignments analyzed
            break
        if aln.score > best_score:
            best_score = aln.score
            best_aln = aln

    t2q = {}
    # best_aln stays None when no alignment scored above zero; return an
    # empty mapping instead of failing on a placeholder value.
    if best_aln is not None:
        obj_blocks, ref_blocks = best_aln.aligned[0], best_aln.aligned[1]
        for obj_block, ref_block in zip(obj_blocks, ref_blocks):
            for x, y in zip(range(*obj_block), range(*ref_block)):
                t2q[x] = y

    if segid_obj is None:
        return t2q
    return {(segid_obj, k): (segid_ref, v) for k, v in t2q.items()}
Exemple #11
0
    def _align_clusters(config, one, two, cutoff=0.3):
        """Constructs a cluster alignment using the given configuration.

        Args:
            config: Mapping of aligner attribute names to values. An optional
                "substitution_matrix" entry names the matrix to load
                (BLOSUM62 when absent or not recognised). The mapping itself
                is not modified.
            one: Query cluster; its ``loci`` and their ``genes`` are iterated.
            two: Target cluster, iterated the same way.
            cutoff: Minimum identity for a gene pair to be linked.

        Returns:
            An ``Alignment`` holding a link for every gene pair whose identity
            reaches ``cutoff``.
        """
        LOG.info("%s vs %s", one.name, two.name)

        # Work on a copy: popping from the caller's dict would make every
        # later call with the same config fall back to BLOSUM62.
        config = dict(config)

        aligner = Align.PairwiseAligner()

        # Select the substitution matrix.
        # Defaults to BLOSUM62 when none or invalid matrix specified.
        matrix = config.pop("substitution_matrix", "BLOSUM62")
        if matrix not in substitution_matrices.load():
            LOG.warning(
                "Invalid substitution matrix '(%s)', defaulting to BLOSUM62",
                matrix)
            matrix = "BLOSUM62"
        aligner.substitution_matrix = substitution_matrices.load(matrix)

        # ValueError is thrown during sequence alignment when a letter
        # in the sequence is not found in the substitution matrix.
        # Extended IUPAC codes (BXZJUO) are added to mitigate this.
        extend_matrix_alphabet(aligner.substitution_matrix, codes='BXZJUO')

        for k, v in config.items():
            setattr(aligner, k, v)

        alignment = Alignment(query=one, target=two)
        for locusA, locusB in product(one.loci, two.loci):
            for geneA, geneB in product(locusA.genes, locusB.genes):
                # Genes without a translation cannot be aligned.
                if not geneA.translation or not geneB.translation:
                    continue
                aln = aligner.align(geneA.translation, geneB.translation)
                identity, similarity = compute_identity(aln[0])
                if identity < cutoff:
                    continue
                alignment.add_link(geneA, geneB, identity, similarity)
        return alignment
Exemple #12
0
def replace_missing_residues(template_alignment, template_id, chain, pdb):
    """Blank out residues of a template alignment that are absent from a PDB.

    The gap-free template sequence is globally aligned to the PDB chain
    sequence (with "X" removed). Template residues aligned to a gap in the
    PDB sequence are replaced by "-"; the original gaps of
    ``template_alignment`` are kept in place.

    Args:
        template_alignment: Gapped template sequence ("-" marks gaps).
        template_id: Identifier used, together with ``chain``, to select the
            record whose id is ``"<template_id>:<chain>"``.
        chain: Chain identifier.
        pdb: File name or handle passed to ``SeqIO.parse(..., 'pdb-atom')``.

    Returns:
        A string of the same length as ``template_alignment``.

    Raises:
        ValueError: If the requested chain is not present in ``pdb``.
    """
    record_id = f'{template_id}:{chain}'
    chain_seqs = [
        str(record.seq) for record in SeqIO.parse(pdb, 'pdb-atom')
        if record.id == record_id
    ]
    if not chain_seqs:
        # Fail here with the offending ids rather than later inside the
        # aligner with an unrelated message.
        raise ValueError(
            f'chain {chain!r} of template {template_id!r} not found in {pdb!r}')
    template_pdb = chain_seqs[0].replace('X', '')

    aligner = Align.PairwiseAligner()
    aligner.mode = 'global'
    alignment = next(
        aligner.align(template_alignment.replace('-', ''), template_pdb))
    a, _, b = str(alignment).splitlines()
    assert len(a) == len(b)

    # Mark template residues facing a gap in the PDB row with a placeholder
    # so they survive the removal of the template row's own gap columns.
    marked = [
        '@' if pdb_char == '-' else template_char
        for template_char, pdb_char in zip(a, b)
    ]
    residues = list(''.join(marked).replace('-', ''))

    # Re-insert the gaps of the original template alignment.
    for i, res in enumerate(template_alignment):
        if res == '-':
            residues.insert(i, '-')

    result = ''.join(residues).replace('@', '-')
    assert len(result) == len(template_alignment)
    return result
Exemple #13
0
def biopython_align(qseq, tseq, param, table=False, strict=False):
    """Align a target record against a query record and print every alignment.

    ``param`` supplies ``mode``, ``matrix``, ``gap_open`` and ``gap_extend``;
    ``param.is_dna`` is set here, and ``param.matrix`` is filled in when it
    is empty. ``table`` selects tabular over pairwise printing. ``strict``
    applies the regular gap penalties to end gaps, unless the mode is
    semiglobal, in which case end gaps are always free.
    """
    # Query and target sequences as plain strings.
    q = str(qseq.seq)
    t = str(tseq.seq)

    aligner = Align.PairwiseAligner()

    # Only local mode needs to be selected explicitly; global and semiglobal
    # differ in end-gap scoring alone.
    if param.mode == const.LOCAL_ALIGN:
        aligner.mode = 'local'

    # Guess DNA vs peptide from the first 100 query characters.
    param.is_dna = set(q[:100]) <= set("ATGC")

    # Fall back to a default substitution matrix for the detected type.
    if not param.matrix:
        param.matrix = 'NUC.4.4' if param.is_dna else 'BLOSUM62'
    aligner.substitution_matrix = substitution_matrices.load(param.matrix)

    # Gap scoring.
    aligner.open_gap_score = -param.gap_open
    aligner.extend_gap_score = -param.gap_extend

    # End gap scoring: semiglobal always wins over strict.
    if strict and param.mode != const.SEMIGLOBAL_ALIGN:
        aligner.target_end_open_gap_score = -param.gap_open
        aligner.target_end_extend_gap_score = -param.gap_extend

        aligner.query_end_open_gap_score = -param.gap_open
        aligner.query_end_extend_gap_score = -param.gap_extend
    else:
        aligner.target_end_gap_score = 0.0
        aligner.query_end_gap_score = 0.0

    # Biopython aligns target to query; wrap each result lazily in the more
    # detailed Alignment class.
    detailed = (
        Alignment(qseq=qseq, tseq=tseq, aln=raw, param=param)
        for raw in aligner.align(t, q)
    )

    print_func = print_tabular if table else print_pairwise

    for index, aln in enumerate(detailed):
        print_func(aln, param=param, index=index)
Exemple #14
0
 def __init__(self):
     """Start with empty sequences and a global BLOSUM62 aligner."""
     # Global alignment, gap open -10, gap extend -0.5.
     aligner = Align.PairwiseAligner()
     aligner.mode = 'global'
     aligner.open_gap_score = -10
     aligner.extend_gap_score = -0.5
     aligner.substitution_matrix = matlist.blosum62

     self.seqa, self.seqb = '', ''
     self.aligner = aligner
     self.alignment_count = 0
Exemple #15
0
def align(seq1, seq2):
    """Return the local alignment score of ``seq1`` against ``seq2``.

    Scoring: match +2, mismatch -3, gap open -4, gap extend -2.
    """
    aligner = Align.PairwiseAligner()
    aligner.mode = "local"
    aligner.open_gap_score = -4
    aligner.extend_gap_score = -2
    # ``match_score``/``mismatch_score`` are the documented attribute names;
    # the legacy ``match``/``mismatch`` spellings are not accepted by
    # current Biopython.
    aligner.match_score = 2
    aligner.mismatch_score = -3
    return aligner.score(seq1, seq2)
def test_poly_a():
    """Grid-search poly-A detection settings against the RepeatMasker library.

    For every combination of minimum score and window size, each library
    record is scored for a run of "a" near its end and, failing that, a run
    of "t" at its start. Records whose comment mentions SINE or LINE count
    as positives. Sensitivity, precision and F-measure are printed for each
    combination.
    """
    # The aligner settings do not depend on the loop variables: build once.
    aligner = Align.PairwiseAligner()
    aligner.mode = 'local'
    aligner.open_gap_score = -1.0
    aligner.extend_gap_score = -1.0
    # Documented attribute name; legacy ``mismatch`` is not accepted by
    # current Biopython.
    aligner.mismatch_score = -1

    mark1 = 'SINE'
    mark2 = 'LINE'

    for poly_a_min_score in range(5, 11):
        for poly_a_win in range(5, 31, 5):

            pstv_counter = 0
            tp_counter = 0
            fp_counter = 0

            for record in SeqIO.parse(
                    "/Users/zakarota/Tools/repeatMasker/Libraries/RepeatMaskerLib.embl",
                    "embl"):
                comment = record.annotations['comment']
                is_positive = mark1 in comment or mark2 in comment
                if is_positive:
                    pstv_counter += 1

                # NOTE(review): the slice stops at -1, so the final base is
                # excluded from the window - confirm this is intended.
                score = aligner.score(record.seq[-poly_a_win:-1],
                                      poly_a_win * "a")
                if score < poly_a_min_score:
                    # Search for poly-A tail in the negative strand
                    score = aligner.score(str(record.seq[0:poly_a_win]),
                                          poly_a_win * "t")

                if score >= poly_a_min_score:
                    if is_positive:
                        tp_counter += 1
                    else:
                        fp_counter += 1

            # Guard against a library with no SINE/LINE records.
            if pstv_counter > 0:
                sensitivity = 100 * tp_counter / pstv_counter
            else:
                sensitivity = 0

            if tp_counter > 0 or fp_counter > 0:
                precision = 100 * tp_counter / (tp_counter + fp_counter)
            else:
                precision = 0

            if sensitivity > 0 or precision > 0:
                f_measure = 2 * (sensitivity * precision) / (sensitivity +
                                                             precision)
            else:
                f_measure = 0

            print('Window:', poly_a_win, 'Min score:', poly_a_min_score,
                  'Sensitivity:', sensitivity, 'Precision:', precision,
                  'F-measure:', f_measure)
Exemple #17
0
    def __init__(self, reference: str, sequence: str):
        """Keep both sequences and prepare a global aligner with free end gaps."""
        self.reference = reference
        self.sequence = sequence

        aligner = Align.PairwiseAligner()
        aligner.mode = 'global'
        # Internal gaps are mildly penalized ...
        aligner.open_gap_score = -0.5
        aligner.extend_gap_score = -0.1
        # ... while gaps at either end of either sequence cost nothing.
        aligner.target_end_gap_score = 0.0
        aligner.query_end_gap_score = 0.0
        self.aligner = aligner
Exemple #18
0
def BLOSUM45_score_dist(s1, s2):
    """Return a BLOSUM45-based distance between ``s1`` and ``s2``.

    The distance is one minus the global alignment score of the pair divided
    by the larger of the two self-alignment scores.
    """
    scorer = Align.PairwiseAligner()
    scorer.open_gap_score = -10
    scorer.substitution_matrix = substitution_matrices.load("BLOSUM45")
    scorer.mode = "global"

    cross = scorer.score(s1, s2)
    self_1 = scorer.score(s1, s1)
    self_2 = scorer.score(s2, s2)
    return 1 - cross / max(self_1, self_2)
Exemple #19
0
def align_before_after(output_dir, sv, query_seq, ref_seq_1, ref_seq_2):
    """Score ``query_seq`` against the reference before and after an SV.

    When ``sv.is_third_fil`` is false, both scores come from a global
    Biopython alignment (match 1, mismatch -1, gap open -1, extend -0.5).
    Otherwise the query is written to ``output_dir + "tmp_query.fasta"``,
    indexed with mappy, and each reference is mapped onto it; the score is
    ``len(query_seq) - (len(ref) - mlen)`` of the first hit. The temporary
    file is always removed afterwards.

    Args:
        output_dir: Directory prefix for the temporary FASTA; it is
            concatenated directly with the file name.
        sv: Object providing ``is_third_fil`` and ``idx``.
        query_seq: Query sequence string.
        ref_seq_1: Reference sequence before the SV.
        ref_seq_2: Reference sequence after the SV.

    Returns:
        ``(score_before, score_after)``, or ``(None, None)`` when mappy
        yields no hit for either reference.
    """
    # within length limit
    if not sv.is_third_fil:
        aligner = Align.PairwiseAligner()
        aligner.mode = 'global'
        aligner.match_score = 1
        aligner.mismatch_score = -1
        aligner.open_gap_score = -1
        aligner.extend_gap_score = -0.5
        alignment_beforeSV = aligner.score(query_seq, ref_seq_1)
        alignment_afterSV = aligner.score(query_seq, ref_seq_2)
        return alignment_beforeSV, alignment_afterSV

    tmp_fasta = output_dir + "tmp_query.fasta"
    with open(tmp_fasta, "w") as handle:
        handle.write('>' + str(sv.idx) + "\n")
        handle.write(query_seq + "\n")

    try:
        aligner = mappy.Aligner(fn_idx_in=tmp_fasta)
        hits_before = aligner.map(ref_seq_1, seq2=None, cs=False, MD=False)
        hits_after = aligner.map(ref_seq_2, seq2=None, cs=False, MD=False)

        try:
            agt_before = next(hits_before)
            agt_after = next(hits_after)
        except Exception:
            # Best effort: StopIteration (no hit) is the expected case, but
            # any mapping failure is reported as "no result". Unlike a bare
            # except, this lets KeyboardInterrupt/SystemExit propagate.
            return None, None
    finally:
        # Runs on every exit path so the temporary file never leaks.
        os.remove(tmp_fasta)

    alignment_beforeSV = len(query_seq) - (len(ref_seq_1) - agt_before.mlen)
    alignment_afterSV = len(query_seq) - (len(ref_seq_2) - agt_after.mlen)
    return alignment_beforeSV, alignment_afterSV
Exemple #20
0
def get_align_score(seq1, seq2):
    """Return the global alignment score of ``seq1`` against ``seq2``.

    Scoring: match +1, mismatch -1, gap open -1, gap extend -0.5.
    """
    scorer = Align.PairwiseAligner()
    scorer.mode = 'global'
    scorer.match_score = 1
    scorer.mismatch_score = -1
    scorer.open_gap_score = -1
    scorer.extend_gap_score = -0.5
    return scorer.score(seq1, seq2)
Exemple #21
0
def init_basic_aligner(allow_mismatches=False):
    """Returns an aligner with minimal assumptions about gaps.

    Args:
        allow_mismatches: When true, mismatches score -1, gaps score -3 and
            gaps in the target are forbidden (``-inf``). When false,
            mismatches are forbidden (``-inf``) and gap scores keep the
            aligner's defaults.

    Returns:
        The configured ``Align.PairwiseAligner``.
    """
    a = Align.PairwiseAligner()
    if allow_mismatches:
        a.mismatch_score = -1
        a.gap_score = -3
        # Must follow gap_score, which also resets the target gap score.
        a.target_gap_score = -np.inf
    else:
        # ``mismatch_score`` is the documented attribute name; the legacy
        # ``mismatch`` spelling is not accepted by current Biopython.
        a.mismatch_score = -np.inf
    return a
Exemple #22
0
def setupAligner(match, mismatch, open, extend):
    """Build an aligner intended to approximate a semi-global alignment.

    ``match`` and ``mismatch`` set the substitution scores. ``open`` and
    ``extend`` are applied to internal gaps and to gaps at the left end of
    the target; all other gap scores keep the aligner's defaults.
    """
    aligner = Align.PairwiseAligner()

    # Substitution scores.
    aligner.match_score = match
    aligner.mismatch_score = mismatch

    # Same open/extend penalties for internal gaps and target-left gaps.
    for region in ("internal", "target_left"):
        setattr(aligner, region + "_open_gap_score", open)
        setattr(aligner, region + "_extend_gap_score", extend)

    return aligner
Exemple #23
0
 def __init__(self, reference: str, query: str, config: dict):
     """Keep both sequences and configure the aligner from ``config``.

     Settings missing from ``config`` fall back to: global mode, gap open
     -0.5, gap extend -0.1 and free end gaps on target and query.
     """
     self.aligner = Align.PairwiseAligner()
     self.reference = reference
     self.query = query

     # (attribute name, default) pairs, applied in this order.
     fallbacks = (
         ('mode', 'global'),
         ('open_gap_score', -0.5),
         ('extend_gap_score', -0.1),
         ('target_end_gap_score', 0.0),
         ('query_end_gap_score', 0.0),
     )
     for attribute, default in fallbacks:
         setattr(self.aligner, attribute, config.get(attribute, default))
def alignSeqs(keySequences, editDistance, count):
    """Find the first later sequence that nearly matches ``keySequences[count]``.

    The sequence at index ``count`` is compared with every sequence after
    it, using gap open and extend scores of -0.5. A candidate qualifies when
    its alignment score is at least ``15 - editDistance`` but below 15.

    Args:
        keySequences: Sequence of strings to compare.
        editDistance: Allowed drop below the score of 15.
        count: Index of the anchor sequence (converted with ``int``).

    Returns:
        ``(anchor, candidate)`` for the first qualifying candidate, or
        ``None`` when there is none.
    """
    aligner = Align.PairwiseAligner()
    aligner.open_gap_score = -0.5
    aligner.extend_gap_score = -0.5
    minScore = 15 - editDistance

    start = int(count)
    anchor = keySequences[start]
    for candidate in keySequences[start + 1:]:
        # Only the score is needed, so skip building the alignments.
        score = aligner.score(anchor, candidate)
        if minScore <= score < 15:
            return anchor, candidate
    return None
Exemple #25
0
    def __init__(self, aligner_config=None):
        """Create empty containers and configure the pairwise aligner.

        ``aligner_config`` is expanded into ``configure_aligner``; when it
        is missing or empty, ``self.aligner_default`` is used instead.
        """
        self.alignments = []
        self.aligner = Align.PairwiseAligner()
        self.clusters = OrderedDict()

        self._alignment_indices = defaultdict(dict)
        self._cluster_names = defaultdict(dict)

        settings = aligner_config if aligner_config else self.aligner_default
        self.configure_aligner(**settings)
Exemple #26
0
def sequence_aligner(sequence_id, reference, sequence, chr_name, snpeff_database_name, annotation_file):
    """Align ``sequence`` to ``reference`` and call variants from the result.

    Among all optimal alignments, the one whose printed sequence row is
    shortest after stripping leading and trailing gaps is used. Running
    residue counts are computed for both aligned rows and passed on to the
    nucleotide and annotation variant callers.

    Returns:
        ``(annotated_variants, annotations)`` as produced by
        ``call_nucleotide_variants`` and by ``filter_ann_and_variants``
        applied to ``call_annotation_variant``.
    """
    aligner = Align.PairwiseAligner()
    # Scores are assigned as attributes after construction.
    aligner.match_score = 3.0
    aligner.mismatch_score = -2.0
    aligner.open_gap_score = -2.5
    aligner.extend_gap_score = -1

    def trimmed_row_length(candidate):
        # Length of the third printed line without its flanking gaps.
        return len(str(candidate).strip().split('\n')[2].strip("-"))

    ranked = sorted(aligner.align(reference, sequence), key=trimmed_row_length)
    printed_rows = str(ranked[0]).strip().split('\n')
    ref_aligned = printed_rows[0]
    seq_aligned = printed_rows[2]
    print(f'#\n#\n#Pipeline: {"Alignment done"} \n#\n#')

    def running_positions(aligned_row):
        # For every column, the number of non-gap symbols seen so far.
        positions = np.zeros(len(seq_aligned), dtype=int)
        seen = 0
        for column, symbol in enumerate(aligned_row):
            if symbol != '-':
                seen += 1
            positions[column] = seen
        return positions

    ref_positions = running_positions(ref_aligned)
    seq_positions = running_positions(seq_aligned)

    annotated_variants = call_nucleotide_variants(
        sequence_id, reference, sequence, ref_aligned, seq_aligned,
        ref_positions, seq_positions, chr_name, snpeff_database_name)
    print(f'#\n#\n#Pipeline: {"Nuc variant called"} \n#\n#')

    raw_annotations = call_annotation_variant(
        annotation_file, ref_aligned, seq_aligned, ref_positions,
        seq_positions)
    annotations = filter_ann_and_variants(raw_annotations)
    print(f'#\n#\n#Pipeline: {"AA variant called"} \n#\n#')

    return annotated_variants, annotations
Exemple #27
0
def populate_from_pair(g_1, g_2, trans_m, emiss_m, N = 10):
	'''
	Fill the transition and emission matrices from at most N optimal
	alignments of genomes g_1 and g_2. The tables are filled with occurrence
	counts; no conversion to a probability matrix is performed here.
	Passing N=None processes every optimal alignment.
	'''
	optimal_alignments = Align.PairwiseAligner().align(g_1, g_2)

	for rank, candidate in enumerate(optimal_alignments):
		limit_reached = N is not None and rank >= N
		if limit_reached:
			break
		populate_from_aligment(candidate, trans_m, emiss_m)
Exemple #28
0
def get_nuc_aligner() -> Align.PairwiseAligner:
    """Build a nucleotide aligner that understands ambiguity codes.

    Gap open is -2.5 and gap extend -1, with free gaps at the left and right
    ends. The substitution matrix defaults every pair to the mismatch score
    (-2.1); a code and each symbol listed for it score by how many symbols
    the code lists: 1 -> match score (3.0), 3 -> 2, 10 -> 1.5, 16 -> 1.
    """
    from Bio.Align.substitution_matrices import Array

    aligner = Align.PairwiseAligner()
    # Scores are assigned as attributes after construction.
    aligner.match_score = 3.0
    aligner.mismatch_score = -2.1
    aligner.open_gap_score = -2.5
    aligner.extend_gap_score = -1

    # Free end gaps on both sides.
    for kind in ("extend", "open"):
        for side in ("right", "left"):
            setattr(aligner, f"{side}_{kind}_gap_score", 0)

    # Score by the number of symbols a code is compatible with.
    score_by_size = {
        1: aligner.match_score,
        3: 2,
        10: 1.5,
        16: 1,
    }

    two_base_codes = "yrwskm"
    everything = "agctyrwskmdvhbnx"
    # Insertion order defines the matrix alphabet.
    compatible = {
        "a": "a",
        "g": "g",
        "c": "c",
        "t": "t",
        # two-base codes: both bases plus the code itself
        "y": "cty",
        "r": "agr",
        "w": "atw",
        "s": "gcs",
        "k": "tgk",
        "m": "cam",
        # three-base codes: bases, the code itself and all two-base codes
        "d": "agtd" + two_base_codes,
        "v": "acgv" + two_base_codes,
        "h": "acth" + two_base_codes,
        "b": "cgtb" + two_base_codes,
        # any-base codes
        "n": everything,
        "x": everything,
    }

    alphabet = "".join(compatible)
    size = len(alphabet)
    matrix = Array(alphabet=alphabet, dims=2,
                   data=np.full((size, size), aligner.mismatch_score))
    for code, partners in compatible.items():
        pair_score = score_by_size[len(partners)]
        for partner in partners:
            matrix[code, partner] = matrix[partner, code] = pair_score

    aligner.substitution_matrix = matrix
    return aligner
def test_mite():
    """Grid-search inverted-repeat detection settings on the RepeatMasker library.

    For every combination of minimum score and window size, the start of
    each record is locally aligned to the reverse complement of its end.
    Records whose comment contains "Type: DNA" count as positives.
    Sensitivity, precision and F-measure are printed for each combination.
    """
    aligner = Align.PairwiseAligner()
    aligner.mode = 'local'
    aligner.open_gap_score = -1.0
    aligner.extend_gap_score = -1.0
    # Documented attribute name; legacy ``mismatch`` is not accepted by
    # current Biopython.
    aligner.mismatch_score = -1.0

    mark = 'Type: DNA'

    for ir_min_score in range(5, 11):
        for lt_win in range(10, 50, 5):
            dna_counter = 0
            tp_counter = 0
            fp_counter = 0

            rt_win = lt_win

            for record in SeqIO.parse(
                    "/Users/zakarota/Tools/repeatMasker/Libraries/RepeatMaskerLib.embl",
                    "embl"):
                is_dna = mark in record.annotations['comment']
                if is_dna:
                    dna_counter += 1

                # NOTE(review): the slice stops at -1, so the final base is
                # excluded from the window - confirm this is intended.
                rc = record.seq[-lt_win:-1].reverse_complement()
                score = aligner.score(str(record.seq[0:rt_win]), str(rc))

                if score >= ir_min_score:
                    if is_dna:
                        tp_counter += 1
                    else:
                        fp_counter += 1

            # Guard against a library with no DNA-type records.
            if dna_counter > 0:
                sensitivity = 100 * tp_counter / dna_counter
            else:
                sensitivity = 0

            if tp_counter > 0 or fp_counter > 0:
                precision = 100 * tp_counter / (tp_counter + fp_counter)
            else:
                precision = 0

            if sensitivity > 0 or precision > 0:
                f_measure = 2 * (sensitivity * precision) / (sensitivity +
                                                             precision)
            else:
                f_measure = 0

            print('Left window:', lt_win, 'Right window:', rt_win,
                  'Min score:', ir_min_score, 'Sensitivity:', sensitivity,
                  'Precision:', precision, 'F-measure:', f_measure)
Exemple #30
0
def Needleman_Wunsch_alignment(seq1, seq2):
    '''
    Function for doing global alignment between seq1 and seq2 using Needleman-Wunsch algorithm implemented in Biopython

    Scoring uses BLOSUM62 with gap open -10 and gap extend -0.5. Any "-" in
    seq1 is removed before aligning. Afterwards the aligned rows are walked
    alongside the original gap positions of seq1, and columns in which both
    rows hold "-" are dropped.

    Returns the two aligned rows as a (final1, final2) tuple of strings.
    '''
    missing = None
    if "-" in seq1:
        # Need to handle "-" beforehand, otherwise the alignment may fail
        missing = [s == "-" for s in seq1]
        seq1 = seq1.replace("-", "")
    aligner = Align.PairwiseAligner()
    aligner.open_gap_score = -10
    aligner.extend_gap_score = -0.5
    aligner.substitution_matrix = blosum62
    alignment = aligner.align(seq1, seq2)[0]
    # The printed alignment has the first row on line 0 and the second row
    # on line 2.
    alignment_info = str(alignment).split("\n")
    aligned1, aligned2 = alignment_info[0], alignment_info[2]
    if missing is None:
        return aligned1, aligned2

    # Assign alignment with "-": collect (row1, row2) column pairs.
    columns = []
    total = len(aligned1)
    j = 0
    for was_gap in missing:
        if was_gap:
            columns.append(("-", "-"))
            continue
        # Check the bound before indexing so the scan cannot run off the end
        # of the aligned row.
        while j < total and aligned1[j] == "-":
            columns.append((aligned1[j], aligned2[j]))
            j += 1
        if j < total:
            columns.append((aligned1[j], aligned2[j]))
            j += 1
    # Whatever remains of the alignment after the last seq1 position.
    columns.extend(zip(aligned1[j:], aligned2[j:]))

    # Cleaning up: drop columns that are gaps in both rows.
    kept = [(c1, c2) for c1, c2 in columns if not (c1 == "-" and c2 == "-")]
    final1 = "".join(c1 for c1, _ in kept)
    final2 = "".join(c2 for _, c2 in kept)
    return final1, final2