def _save_alignment(self):
    """Write the sequence of every clade of ``self.tree`` to a FASTA file.

    One record is built per clade returned by ``self.tree.find_clades()``,
    using the clade name as both record id and record name and an empty
    description.  The records are wrapped in a multiple sequence alignment
    and written to ``out_aln_fasta`` inside ``self._root_dir``.

    NOTE(review): ``out_aln_fasta`` and ``myopen`` are not defined in this
    block; presumably they are module-level names - confirm.
    """
    records = []
    for clade in self.tree.find_clades():
        records.append(
            SeqRecord.SeqRecord(
                Seq.Seq("".join(clade.sequence)),
                id=clade.name,
                name=clade.name,
                description="",
            )
        )
    alignment = Align.MultipleSeqAlignment(records)

    target_path = os.path.join(self._root_dir, out_aln_fasta)
    with myopen(target_path, "w") as handle:
        AlignIO.write(alignment, handle, "fasta")
Example #2
0
def setupAligner(
    match, mismatch, open, extend
):
    """Return a ``PairwiseAligner`` configured with the given scores.

    The aligner is meant to approximate a semi-global alignment: the same
    gap-open / gap-extend penalties are applied to internal gaps and to
    gaps at the left end of the target; every other setting keeps the
    library default.

    match:    score for a matching pair
    mismatch: score for a mismatching pair
    open:     gap-open score (internal and target-left)
    extend:   gap-extend score (internal and target-left)
    """
    aligner = Align.PairwiseAligner()
    settings = (
        # substitution scores
        ("match_score", match),
        ("mismatch_score", mismatch),
        # gaps inside the alignment
        ("internal_open_gap_score", open),
        ("internal_extend_gap_score", extend),
        # gaps at the left end of the target
        ("target_left_open_gap_score", open),
        ("target_left_extend_gap_score", extend),
    )
    for attribute, value in settings:
        setattr(aligner, attribute, value)
    return aligner
Example #3
0
def align_before_after(output_dir, sv, query_seq, ref_seq_1, ref_seq_2):
    """Score ``query_seq`` against the reference before and after an SV.

    Two strategies are used, selected by ``sv.is_third_fil``:

    * falsy: a global ``Align.PairwiseAligner`` (match 1, mismatch -1,
      gap open -1, gap extend -0.5) scores the query against each
      reference sequence.
    * truthy: the query is written to a temporary FASTA file, indexed with
      ``mappy.Aligner`` and each reference sequence is mapped onto it.  The
      score of a reference is ``len(query_seq) - (len(ref) - hit.mlen)``
      where ``hit`` is the first hit yielded by ``aligner.map``.

    Parameters:
        output_dir: prefix for the temporary file; it is concatenated
            directly with ``"tmp_query.fasta"`` (no separator is added).
        sv: object providing ``is_third_fil`` and ``idx``.
        query_seq: query sequence (string).
        ref_seq_1: reference sequence before the SV.
        ref_seq_2: reference sequence after the SV.

    Returns:
        ``(score_before, score_after)``, or ``(None, None)`` when either
        mapping produces no hit in the mappy branch.
    """
    if not sv.is_third_fil:
        # within length limit: exact pairwise scoring
        aligner = Align.PairwiseAligner()
        aligner.mode = 'global'
        aligner.match_score = 1
        aligner.mismatch_score = -1
        aligner.open_gap_score = -1
        aligner.extend_gap_score = -0.5
        alignment_beforeSV = aligner.score(query_seq, ref_seq_1)
        alignment_afterSV = aligner.score(query_seq, ref_seq_2)
        return alignment_beforeSV, alignment_afterSV

    tmp_fasta = output_dir + "tmp_query.fasta"
    try:
        with open(tmp_fasta, "w") as handle:
            handle.write('>' + str(sv.idx) + "\n")
            handle.write(query_seq + "\n")

        aligner = mappy.Aligner(fn_idx_in=tmp_fasta)
        hits_before = aligner.map(ref_seq_1, seq2=None, cs=False, MD=False)
        hits_after = aligner.map(ref_seq_2, seq2=None, cs=False, MD=False)

        try:
            agt_before = next(hits_before)
            agt_after = next(hits_after)
        except Exception:
            # Best effort: no hit (StopIteration) or a mapping failure means
            # "no score".  KeyboardInterrupt / SystemExit still propagate.
            return None, None

        alignment_beforeSV = len(query_seq) - (len(ref_seq_1) -
                                               agt_before.mlen)
        alignment_afterSV = len(query_seq) - (len(ref_seq_2) -
                                              agt_after.mlen)
        return alignment_beforeSV, alignment_afterSV
    finally:
        # Single cleanup point: runs on every exit path, including errors
        # raised while building the index.
        if os.path.exists(tmp_fasta):
            os.remove(tmp_fasta)
Example #4
0
def load_aln(infile):
    """Read a FASTA file into an alignment and an id -> sequence mapping.

    Every record parsed from ``infile`` is appended to a
    ``MultipleSeqAlignment`` and its sequence is stored as a plain string
    under the record id.

    Returns the tuple ``(alignment, sequences_by_id)``.
    """
    sequences_by_id = {}
    alignment = Align.MultipleSeqAlignment([])

    with open(infile, 'r') as handle:
        for record in SeqIO.parse(handle, 'fasta'):
            alignment.append(record)
            sequences_by_id[record.id] = str(record.seq)

    return alignment, sequences_by_id
Example #5
0
def get_align_score(seq1, seq2):
    """Return the global pairwise alignment score of ``seq1`` and ``seq2``.

    Scoring scheme: match 1, mismatch -1, gap open -1, gap extend -0.5.
    Only the score is computed; no alignment object is built.
    """
    scorer = Align.PairwiseAligner()
    scheme = (
        ("mode", 'global'),
        ("match_score", 1),
        ("mismatch_score", -1),
        ("open_gap_score", -1),
        ("extend_gap_score", -0.5),
    )
    for option, value in scheme:
        setattr(scorer, option, value)
    return scorer.score(seq1, seq2)
Example #6
0
def alignment(seqs_in, profile, run_id):
    '''Align multiple sequences to a covariance model profile with cmalign.

    input:
      seqs_in: a list of biopython SeqRecord objects (plain strings are
               rejected because record ids are required)
      profile: the filename of a covariance model profile
      run_id:  a run id to use for naming temporary files to avoid collisions

    output:
      ali:     a multiple sequence alignment built from the cmalign output,
               with sequences upper-cased and the original ids restored
      ref:     the reference sequence returned by rutils.stk_parse
      struct:  the reference structure returned by rutils.stk_parse

    Before alignment every sequence is reduced to the letters in 'AUTGC'
    and given a placeholder id 'S000', 'S001', ...; the placeholders are
    mapped back to the original ids afterwards.
    '''
    if isinstance(seqs_in[0], str):
        raise Exception(
            'Sorry but string lists are not supported. We need ids!')

    placeholder_ids = ['S{0:03}'.format(pos) for pos in range(len(seqs_in))]

    seqs = [
        Bio.SeqRecord.SeqRecord(
            Bio.Seq.Seq(
                ''.join([let for let in str(rec.seq) if let in 'AUTGC']),
                Bio.Seq.Alphabet.RNAAlphabet), placeholder)
        for placeholder, rec in zip(placeholder_ids, seqs_in)
    ]

    name_maps = dict(
        (placeholder, rec.id)
        for placeholder, rec in zip(placeholder_ids, seqs_in))

    # The temp-file names carry the index of the last input sequence.  The
    # original code relied on the comprehension variable leaking into the
    # function scope, which only happens on Python 2; bind it explicitly so
    # the same file names are produced on Python 3 as well.
    idx = len(seqs_in) - 1

    infile = cfg.dataPath('infernal/temp/{0}_{1:03}_unaligned.fa'.format(
        run_id, idx))
    outfile = cfg.dataPath('infernal/temp/{0}_{1:03}_aligned.stk'.format(
        run_id, idx))
    Bio.SeqIO.write(seqs, infile, 'fasta')

    # NOTE(review): the command is assembled as a shell string; paths that
    # contain spaces or shell metacharacters would break it.
    cstr = 'cmalign -o {0} {1} {2}'.format(outfile, profile, infile)
    ispc = spc.Popen(cstr, shell=True, stdout=spc.PIPE)
    ispc.communicate()  # wait for cmalign to finish; its stdout is unused

    with open(outfile) as fopen:
        seqs, ref, struct = rutils.stk_parse(fopen)
    ali = ba.MultipleSeqAlignment(seqs)

    for a in ali:
        a.seq = a.seq.upper()
        a.id = name_maps[a.id]

    return ali, ref, struct
Example #7
0
 def __init__(self, reference: str, query: str, config: dict):
     """Store the two sequences and configure a pairwise aligner.

     Each aligner option is read from ``config`` under the same name as
     the aligner attribute it sets; the fallback used when the key is
     absent is listed next to it below.
     """
     self.aligner = Align.PairwiseAligner()
     self.reference = reference
     self.query = query

     option_defaults = (
         ('mode', 'global'),
         ('open_gap_score', -0.5),
         ('extend_gap_score', -0.1),
         ('target_end_gap_score', 0.0),
         ('query_end_gap_score', 0.0),
     )
     for option, fallback in option_defaults:
         setattr(self.aligner, option, config.get(option, fallback))
Example #8
0
def init_basic_aligner(allow_mismatches=False):
    """Return a ``PairwiseAligner`` tuned for (almost) exact matching.

    With ``allow_mismatches`` truthy, mismatches cost -1, gaps cost -3 and
    gaps in the target are forbidden (score ``-inf``).  Otherwise
    mismatches themselves are forbidden and gap scores keep the library
    defaults.
    """
    aligner = Align.PairwiseAligner()
    if allow_mismatches:
        aligner.mismatch_score = -1
        aligner.gap_score = -3
        aligner.target_gap_score = -np.inf
    else:
        # NOTE(review): both ``mismatch`` and ``mismatch_score`` are set;
        # presumably ``mismatch`` is an older spelling of the same
        # attribute - confirm against the Biopython version in use.
        aligner.mismatch = -np.inf
        aligner.mismatch_score = -np.inf
    return aligner
Example #9
0
    def __init__(self, aligner_config=None):
        """Create empty containers and configure the pairwise aligner.

        ``aligner_config`` is a mapping of keyword arguments forwarded to
        ``self.configure_aligner``; when it is missing or empty,
        ``self.aligner_default`` is used instead.
        """
        # lookup tables, created on first access of a key
        self._alignment_indices = defaultdict(dict)
        self._cluster_names = defaultdict(dict)

        self.alignments = []
        self.clusters = OrderedDict()
        self.aligner = Align.PairwiseAligner()

        settings = aligner_config or self.aligner_default
        self.configure_aligner(**settings)
def alignSeqs(keySequences, editDistance, count):
    """Find the first later sequence that nearly matches ``keySequences[count]``.

    The sequence at position ``count`` is scored against every sequence
    after it, using a pairwise aligner with gap-open and gap-extend scores
    of -0.5.  A partner qualifies when its score is at least
    ``15 - editDistance`` and strictly below 15.

    keySequences: indexable collection of sequences
    editDistance: allowed score deficit relative to 15
    count:        index of the anchor sequence (anything ``int()`` accepts)

    Returns ``(anchor, partner)`` for the first qualifying partner, or
    ``None`` when there is none.
    """
    aligner = Align.PairwiseAligner()
    aligner.open_gap_score = -0.5
    aligner.extend_gap_score = -0.5
    minScore = 15 - editDistance

    anchor_index = int(count)
    i = keySequences[anchor_index]
    for j in keySequences[anchor_index + 1:]:
        # Only the optimal score is needed, so skip building alignments.
        score = aligner.score(i, j)
        if minScore <= score < 15:
            return i, j
    return None
Example #11
0
def sequence_aligner(sequence_id, reference, sequence, chr_name, snpeff_database_name, annotation_file):
    """Align ``sequence`` to ``reference`` and call variants on the result.

    Steps:
      1. pairwise-align the two sequences (match 3, mismatch -2,
         gap open -2.5, gap extend -1);
      2. among all optimal alignments pick the one whose printed third
         line is shortest once leading/trailing '-' are stripped;
      3. derive, for every alignment column, the running count of non-gap
         symbols seen so far in the reference row and in the sequence row;
      4. call nucleotide variants and annotation-level variants.

    NOTE(review): the aligned rows are taken from lines 0 and 2 of
    ``str(alignment)``, so this depends on the text layout produced by the
    installed Biopython version - confirm.

    Returns ``(annotated_variants, annotations)``.
    """
    aligner = Align.PairwiseAligner()
    # scores are assigned as attributes; passing them to the constructor
    # reportedly does not work
    aligner.match_score = 3.0
    aligner.mismatch_score = -2.0
    aligner.open_gap_score = -2.5
    aligner.extend_gap_score = -1

    def stripped_third_line_length(candidate):
        return len(str(candidate).strip().split('\n')[2].strip("-"))

    alignments = sorted(list(aligner.align(reference, sequence)),
                        key=stripped_third_line_length)
    printed_rows = str(alignments[0]).strip().split('\n')
    ref_aligned = printed_rows[0]
    seq_aligned = printed_rows[2]
    print(f'#\n#\n#Pipeline: {"Alignment done"} \n#\n#')

    # running count of reference residues at each alignment column
    ref_positions = np.zeros(len(seq_aligned), dtype=int)
    consumed = 0
    for column, symbol in enumerate(ref_aligned):
        if symbol != '-':
            consumed += 1
        ref_positions[column] = consumed

    # running count of query residues at each alignment column
    seq_positions = np.zeros(len(seq_aligned), dtype=int)
    consumed = 0
    for column, symbol in enumerate(seq_aligned):
        if symbol != '-':
            consumed += 1
        seq_positions[column] = consumed

    annotated_variants = call_nucleotide_variants(
        sequence_id,
        reference,
        sequence,
        ref_aligned,
        seq_aligned,
        ref_positions,
        seq_positions,
        chr_name,
        snpeff_database_name,
    )
    print(f'#\n#\n#Pipeline: {"Nuc variant called"} \n#\n#')

    raw_annotations = call_annotation_variant(
        annotation_file,
        ref_aligned,
        seq_aligned,
        ref_positions,
        seq_positions,
    )
    annotations = filter_ann_and_variants(raw_annotations)
    print(f'#\n#\n#Pipeline: {"AA variant called"} \n#\n#')

    return annotated_variants, annotations
Example #12
0
def remove_positions_with_gaps_in_first_sequence(input_fasta, output_fasta):
    """Drop every alignment column where the first sequence has a gap.

    The FASTA alignment at ``input_fasta`` is read, the columns in which
    the first record is not '-' are concatenated in order, and the result
    is written to ``output_fasta`` in FASTA format.

    Returns ``output_fasta``.
    """
    aln = AlignIO.read(str(input_fasta), 'fasta')
    reference_row = str(aln[0].seq)
    kept_columns = [
        column for column, symbol in enumerate(reference_row)
        if symbol != '-'
    ]

    # seed the result with the first kept column, then append the others
    head = kept_columns[0]
    trimmed = Align.MultipleSeqAlignment(aln[:, head:head + 1])
    for column in kept_columns[1:]:
        trimmed += aln[:, column:column + 1]

    AlignIO.write(trimmed, str(output_fasta), 'fasta')
    return output_fasta
Example #13
0
def load_aln_to_repair(infile, omit):
    """Read a FASTA file, keeping some records out of the alignment.

    Every record's sequence is stored as a string under its id in the
    returned dictionary.  A record is added to the returned alignment only
    when its ``name`` is not contained in ``omit``.

    Returns the tuple ``(alignment, sequences_by_id)``.
    """
    sequences_by_id = {}
    kept = Align.MultipleSeqAlignment([])

    with open(infile, 'r') as handle:
        for record in SeqIO.parse(handle, 'fasta'):
            sequences_by_id[record.id] = str(record.seq)
            if record.name in omit:
                continue
            kept.append(record)

    return kept, sequences_by_id
Example #14
0
def save_alignment(tt: TreeTime, config: TreetimeConfig):
    """Write the sequence of every clade of ``tt.tree`` as a FASTA alignment.

    One record is created per clade returned by ``tt.tree.find_clades()``
    (id and name set to the clade name, empty description) and the
    resulting alignment is written to ``config.output_filenames.FASTA``.
    """
    records = []
    for clade in tt.tree.find_clades():
        sequence = Seq.Seq("".join(clade.sequence))
        records.append(
            SeqRecord.SeqRecord(
                sequence,
                id=clade.name,
                name=clade.name,
                description="",
            )
        )

    alignment = Align.MultipleSeqAlignment(records)

    with open(config.output_filenames.FASTA, "w") as handle:
        AlignIO.write(alignment, handle, "fasta")
Example #15
0
def populate_from_pair(g_1, g_2, trans_m, emiss_m, N = 10):
    """Fill the transition and emission tables from a pair of genomes.

    At most ``N`` optimal alignments of ``g_1`` and ``g_2`` are passed to
    ``populate_from_aligment`` together with both tables; with ``N=None``
    every alignment is used.  The tables are filled with occurrence counts;
    no conversion into a probability matrix is performed here.
    """
    pairwise = Align.PairwiseAligner()
    processed = 0
    for candidate in pairwise.align(g_1, g_2):
        if N is not None and processed >= N:
            break
        populate_from_aligment(candidate, trans_m, emiss_m)
        processed += 1
Example #16
0
def get_nuc_aligner() -> Align.PairwiseAligner:
    """Build a nucleotide aligner that understands IUPAC ambiguity codes.

    Base scores: match 3.0, mismatch -2.1, gap open -2.5, gap extend -1;
    the left/right end-gap open and extend scores are set to 0.

    A substitution matrix over the lowercase IUPAC alphabet is attached.
    Every cell starts at the mismatch score; each code is then given a
    positive score against every symbol in its compatibility list.  The
    score depends on the length of that list (1 -> match score, 3 -> 2,
    10 -> 1.5, 16 -> 1).  Entries are applied in dictionary order, so
    later, broader codes overwrite cells set by earlier ones.
    """
    from Bio.Align.substitution_matrices import Array

    aligner = Align.PairwiseAligner()
    # scores are assigned as attributes; passing them to the constructor
    # reportedly does not work
    aligner.match_score = 3.0
    aligner.mismatch_score = -2.1
    aligner.open_gap_score = -2.5
    aligner.extend_gap_score = -1

    for end_gap_option in ("right_extend_gap_score",
                           "left_extend_gap_score",
                           "right_open_gap_score",
                           "left_open_gap_score"):
        setattr(aligner, end_gap_option, 0)

    # score awarded, keyed by the size of a code's compatibility list
    score_by_list_size = {
        1: aligner.match_score,
        3: 2,
        10: 1.5,
        16: 1,
    }

    two_base_codes = "yrwskm"
    every_code = "agctyrwskmdvhbnx"
    compatible = {
        "a": "a",
        "g": "g",
        "c": "c",
        "t": "t",
        # len 3
        "y": "cty",
        "r": "agr",
        "w": "atw",
        "s": "gcs",
        "k": "tgk",
        "m": "cam",
        # len 10
        "d": "agtd" + two_base_codes,
        "v": "acgv" + two_base_codes,
        "h": "acth" + two_base_codes,
        "b": "cgtb" + two_base_codes,
        # len 16
        "n": every_code,
        "x": every_code,
    }

    alphabet = "".join(compatible)
    size = len(alphabet)
    background = np.ones((size, size)) * aligner.mismatch_score
    matrix = Array(alphabet=alphabet, dims=2, data=background)

    for code, partners in compatible.items():
        score = score_by_list_size[len(partners)]
        for partner in partners:
            # keep the matrix symmetric
            matrix[code, partner] = matrix[partner, code] = score

    aligner.substitution_matrix = matrix
    return aligner
def test_mite():
    """Grid-search inverted-repeat detection settings on a RepeatMasker library.

    For every combination of minimum score (5..10) and window length
    (10..45 in steps of 5) the EMBL library is parsed, the first
    ``rt_win`` bases of each record are locally aligned against the
    reverse complement of a window taken from the record's end, and the
    record counts as detected when the score reaches the minimum.
    Records whose ``comment`` annotation contains ``'Type: DNA'`` are the
    positives.  Sensitivity, precision and F-measure (percentages) are
    printed per combination.
    """
    aligner = Align.PairwiseAligner()
    aligner.mode = 'local'
    aligner.open_gap_score = -1.0
    aligner.extend_gap_score = -1.0
    # NOTE(review): ``mismatch`` may be an outdated attribute name
    # (``mismatch_score`` in current Biopython) - confirm for the version
    # in use.
    aligner.mismatch = -1.0

    mark = 'Type: DNA'

    for ir_min_score in range(5, 11):
        for lt_win in range(10, 50, 5):
            dna_counter = 0
            tp_counter = 0
            fp_counter = 0

            rt_win = lt_win

            for record in SeqIO.parse(
                    "/Users/zakarota/Tools/repeatMasker/Libraries/RepeatMaskerLib.embl",
                    "embl"):
                # look the annotation up once per record
                is_dna = mark in record.annotations['comment']
                if is_dna:
                    dna_counter += 1

                # NOTE(review): the slice stops at -1, so the final base is
                # excluded and the right window is one base shorter than
                # the left one - confirm this is intended.
                rc = record.seq[-lt_win:-1].reverse_complement()
                score = aligner.score(str(record.seq[0:rt_win]), str(rc))

                if score >= ir_min_score:
                    if is_dna:
                        tp_counter += 1
                    else:
                        fp_counter += 1

            # Guard against a library without any DNA record, mirroring
            # the zero fallback already used for precision and F-measure.
            if dna_counter > 0:
                sensitivity = 100 * tp_counter / dna_counter
            else:
                sensitivity = 0

            if tp_counter > 0 or fp_counter > 0:
                precision = 100 * tp_counter / (tp_counter + fp_counter)
            else:
                precision = 0

            if sensitivity > 0 or precision > 0:
                f_measure = 2 * (sensitivity * precision) / (sensitivity +
                                                             precision)
            else:
                f_measure = 0

            print('Left window:', lt_win, 'Right window:', rt_win,
                  'Min score:', ir_min_score, 'Sensitivity:', sensitivity,
                  'Precision:', precision, 'F-measure:', f_measure)
Example #18
0
def separate_alignments(msa_data,
                        sus_ids,
                        out_dir,
                        filename,
                        patient_zero='NC_045512.2'):
    """Split an alignment into a "white" and an "inspect" FASTA file.

    Records whose id is in ``sus_ids`` go to the inspect set only; the
    record whose id equals ``patient_zero`` goes to both sets; every other
    record goes to the white set only.  The files are written to
    ``out_dir / (filename + '_aligned_white.fa')`` and
    ``out_dir / (filename + '_aligned_inspect.fa')``.

    Returns 0.
    """
    white_records = []
    inspect_records = []
    for rec in msa_data:
        suspicious = rec.id in sus_ids
        if not suspicious:
            white_records.append(rec)
        if suspicious or rec.id == patient_zero:
            inspect_records.append(rec)

    outputs = (
        (white_records, '_aligned_white.fa'),
        (inspect_records, '_aligned_inspect.fa'),
    )
    for records, suffix in outputs:
        msa = Align.MultipleSeqAlignment(records)
        destination = out_dir / (filename + suffix)
        AlignIO.write(msa, destination, 'fasta')
    return 0
Example #19
0
def Needleman_Wunsch_alignment(seq1, seq2):
    '''
    Globally align seq1 and seq2 with Biopython's PairwiseAligner
    (gap open -10, gap extend -0.5, BLOSUM62 substitution matrix).

    Gap characters already present in seq1 are removed before aligning
    and re-inserted afterwards as columns that are gaps in both rows;
    columns that end up being "-" in both rows are then dropped.

    NOTE(review): the aligned rows are taken from lines 0 and 2 of the
    alignment's string form, which depends on the text layout of the
    installed Biopython version - confirm.

    Returns the tuple (aligned_seq1, aligned_seq2).
    '''
    missing = None
    if "-" in seq1:
        # Need to handle "-" beforehand, otherwise the alignment may fail
        missing = [s == "-" for s in seq1]
        seq1 = seq1.replace("-", "")

    aligner = Align.PairwiseAligner()
    aligner.open_gap_score = -10
    aligner.extend_gap_score = -0.5
    aligner.substitution_matrix = blosum62
    alignment = aligner.align(seq1, seq2)[0]
    alignment_info = str(alignment).split("\n")
    aligned1, aligned2 = alignment_info[0], alignment_info[2]

    if missing is None:
        return aligned1, aligned2

    # Re-insert the original "-" positions of seq1 into both rows.
    total = len(aligned1)
    pieces1 = []
    pieces2 = []
    j = 0
    for was_gap in missing:
        if was_gap:
            pieces1.append("-")
            pieces2.append("-")
            continue
        # Copy alignment-introduced gaps first.  The bound is checked
        # before indexing so running off the end cannot raise IndexError.
        while j < total and aligned1[j] == "-":
            pieces1.append(aligned1[j])
            pieces2.append(aligned2[j])
            j += 1
        if j < total:
            pieces1.append(aligned1[j])
            pieces2.append(aligned2[j])
            j += 1
    if j < total:
        pieces1.append(aligned1[j:])
        pieces2.append(aligned2[j:])

    final1_temp = "".join(pieces1)
    final2_temp = "".join(pieces2)

    # Cleaning up: drop columns that are gaps in both rows
    kept = [(a, b) for a, b in zip(final1_temp, final2_temp)
            if not (a == "-" and b == "-")]
    final1 = "".join(a for a, _ in kept)
    final2 = "".join(b for _, b in kept)
    return final1, final2
Example #20
0
    def __init__(self, aligner_config=None):
        """Create empty lookup tables and configure the pairwise aligner.

        ``aligner_config`` is a mapping of keyword arguments forwarded to
        ``self.configure_aligner``; when it is missing or empty,
        ``self.aligner_default`` is used instead.
        """
        # Lookup dictionaries
        self._genes, self._loci, self._links = {}, {}, {}
        self._alignment_indices = defaultdict(dict)
        self._cluster_names = defaultdict(dict)

        self.alignments = {}
        self.aligner = Align.PairwiseAligner()
        self.clusters = OrderedDict()

        settings = aligner_config or self.aligner_default
        self.configure_aligner(**settings)
Example #21
0
def test_align():
    """
    Determining how a string matches is a key challenge...

    Aligns a short search string against a longer one with a default
    PairwiseAligner and prints every optimal alignment, the optimal score
    and the aligner settings.
    """

    aligner = Align.PairwiseAligner()
    searchstr = 'ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG'
    searchfor = 'GAATGG'
    alignments = aligner.align(searchstr, searchfor)
    for alignment in alignments:
        print(alignment)

    # Call score() so the numeric score is printed; the previous code
    # printed the bound method object itself.
    score = aligner.score(searchstr, searchfor)
    print(score)
    print(aligner)
Example #22
0
def get_aligned_subsequence(ref='jejuni'):
    """Align every entry of ``sequences`` against the reference entry.

    ``sequences`` is read from the enclosing scope.  For each key other
    than ``ref`` the reference (target) and the entry (query) are stripped
    of all whitespace and aligned with a default ``PairwiseAligner``.
    Every key, including ``ref``, is printed as it is visited.

    Returns a dict mapping each non-reference key to the ``aligned``
    attribute of its first alignment.
    """
    alignments = {}
    for key in sequences:
        print(key)
        if key != ref:
            target = ''.join(sequences[ref].split())
            query = ''.join(sequences[key].split())
            pairwise = Align.PairwiseAligner()
            alignments[key] = pairwise.align(target, query)

    return {
        key: found[0].aligned
        for key, found in alignments.items()
    }
    def pairwise_aligner(self):
        """Score ``self.sequence1`` against ``self.sequence2``.

        Uses gap open -10, gap extend -0.5 and free end gaps on both the
        target and the query.  The score is stored in ``self.score``;
        nothing is returned.
        """
        scorer = Align.PairwiseAligner()
        options = (
            ("open_gap_score", -10),
            ("extend_gap_score", -0.5),
            ("target_end_gap_score", 0.0),
            ("query_end_gap_score", 0.0),
        )
        for option, value in options:
            setattr(scorer, option, value)

        # Only the score is needed, so no alignment objects are built;
        # this saves memory and time.
        self.score = scorer.score(self.sequence1, self.sequence2)
Example #24
0
def main():
    """Enqueue a global alignment job and poll until its result is ready.

    The job calls ``global_align(aligner, x, y, matrix)`` with a default
    ``PairwiseAligner`` and the BLOSUM62 matrix.  The job is polled every
    two seconds, printing its id and current result, until a result is
    available or the polling limit is exceeded.  The first alignment and
    its score are then printed.

    NOTE(review): ``x``, ``y``, ``conn`` and ``global_align`` are not
    defined in this block; presumably module-level names - confirm.

    Raises:
        TimeoutError: when no result is available after the polling limit.
    """
    q = Queue(connection=conn)
    mat_name = "BLOSUM62"
    matrix = substitution_matrices.load(mat_name)
    aligner = Align.PairwiseAligner()
    job = q.enqueue(global_align, args=(aligner, x, y, matrix))

    count = 0
    # ``is None`` avoids invoking a custom comparison on the result object.
    while job.result is None and count <= 100000:
        time.sleep(2)
        count += 1
        print(f'job.get_id(): {job.get_id()}, ' f'job.result:{job.result}')

    alignments = job.result
    if alignments is None:
        # Fail with a clear message instead of a TypeError on ``None[0]``.
        raise TimeoutError(
            f'job {job.get_id()} produced no result within the polling limit')
    print(f'alignments[0]:{alignments[0]}\n score: {alignments[0].score}')
Example #25
0
def compute_similarity_kernel_matrices(dataset):
    """
    Computes the drug-drug and protein-protein kernel matrices for kernel-based methods (e.g. Kron-RLS)

    Drug similarity is the Tanimoto coefficient of the two fingerprints.
    Protein similarity is the local (Smith-Waterman) alignment score of
    the two sequences divided by the product of the square roots of each
    sequence's self-alignment score.

    :param dataset: object whose ``X`` iterates over (compound, protein) pairs
    :return: tuple ``(comps_mat, prots_mat)`` of dicts keyed by ``Pair``
    """
    start = time.time()
    print("Computing kernel matrices (KD_dict, KT_dict)")
    all_comps = set()
    all_prots = set()
    for mol, prot in dataset.X:
        all_comps.add(mol)
        all_prots.add(prot)

    # compounds / drugs
    comps_mat = {}
    for c1 in all_comps:
        fp1 = c1.fingerprint
        for c2 in all_comps:
            # Tanimoto coefficient
            comps_mat[Pair(c1, c2)] = DataStructs.TanimotoSimilarity(
                fp1, c2.fingerprint)

    # proteins / targets
    aligner = Align.PairwiseAligner()
    aligner.mode = 'local'  # SW algorithm

    # The self-alignment score depends on one protein only, so compute it
    # once per protein instead of once per pair inside the nested loop.
    prot_seqs = {p: p.sequence[1] for p in all_prots}
    self_score_roots = {
        p: sqrt(aligner.score(seq, seq)) for p, seq in prot_seqs.items()
    }

    prots_mat = {}
    for p1 in all_prots:
        seq1 = prot_seqs[p1]
        root1 = self_score_roots[p1]
        for p2 in all_prots:
            score = aligner.score(seq1, prot_seqs[p2])
            # Normalized SW score
            prots_mat[Pair(p1, p2)] = score / (root1 * self_score_roots[p2])

    print("Kernel entities: Drugs={}, Prots={}".format(len(all_comps),
                                                       len(all_prots)))
    duration = time.time() - start
    print("Kernel matrices computation finished in: {:.0f}m {:.0f}s".format(
        duration // 60, duration % 60))
    return comps_mat, prots_mat
Example #26
0
def fetch_seqs(seqs_filepath, out_fp, sample_idxs: list, is_aligned=False, is_gzip=False):
    """Copy the records whose id is in ``sample_idxs`` to a new FASTA file.

    Parameters:
        seqs_filepath: input FASTA file (gzip-compressed when ``is_gzip``).
        out_fp: destination passed to the Biopython writer.
        sample_idxs: record ids to keep.
        is_aligned: read the input as one alignment and write the
            selection as an alignment; otherwise treat it as plain records.
        is_gzip: open the input with ``gzip`` in text mode.

    Returns:
        Whatever ``AlignIO.write`` / ``SeqIO.write`` returns.
    """
    wanted = set(sample_idxs)  # build once for constant-time membership tests

    if is_aligned:
        if is_gzip:
            with gzip.open(seqs_filepath, "rt") as handle:
                cns = AlignIO.read(handle, 'fasta')
        else:
            cns = AlignIO.read(seqs_filepath, 'fasta')
        my_cns = Align.MultipleSeqAlignment(
            [rec for rec in cns if rec.id in wanted])
        return AlignIO.write(my_cns, out_fp, 'fasta')

    if is_gzip:
        with gzip.open(seqs_filepath, "rt") as handle:
            # SeqIO.parse is lazy: the records must be consumed while the
            # handle is still open.
            my_cns = [rec for rec in SeqIO.parse(handle, 'fasta')
                      if rec.id in wanted]
    else:
        my_cns = [rec for rec in SeqIO.parse(seqs_filepath, 'fasta')
                  if rec.id in wanted]
    return SeqIO.write(my_cns, out_fp, 'fasta')
def consensus_from_alignment (align): ## IUPAC ambiguity codes
    """Return an IUPAC-ambiguity consensus sequence for ``align``.

    Gaps are first rewritten as "N".  For every column, each symbol that
    occurs at least once is expanded through
    ``IUPACData.ambiguous_dna_values``; the sorted, de-duplicated bases are
    then translated back to a single code via ``iupac_dna``.  Only presence
    matters, not frequency.

    When ``ambiguous_dna`` is truthy (biopython < 1.78) it is passed as
    the alphabet to every ``Seq`` that is created.
    """
    def make_seq(text):
        # the alphabet argument only exists on biopython < 1.78
        if ambiguous_dna:
            return Seq.Seq(text, ambiguous_dna)
        return Seq.Seq(text)

    xaln = [
        SeqRecord(make_seq(str(rec.seq).replace("-", "N")),
                  id=rec.id, description=rec.description)
        for rec in align
    ]
    # SummaryInfo must be given an MSA, not a list
    summary_align = AlignInfo.SummaryInfo(Align.MultipleSeqAlignment(xaln))
    pssm = summary_align.pos_specific_score_matrix(chars_to_ignore=["-"])

    consensus = []
    # each pssm entry maps symbol -> count for one column,
    # e.g. {'A': 0, 'T': 4.0, 'G': 0, 'C': 2.0, 'N': 1}
    for column_counts in pssm:
        present_bases = set()
        for base, count in column_counts.items():
            # a code such as "R" expands to its plain bases ("AG")
            expansion = IUPACData.ambiguous_dna_values[base]
            if count > 0:
                present_bases.update(expansion)
        consensus.append(iupac_dna[''.join(sorted(present_bases))])

    return make_seq(''.join(consensus))
Example #28
0
def pop_row(aln, seqid):
    ''' Pop a row from an alignment by sequence id

        aln: a Bio.Align.MultipleSeqAlignment object
        seqid: id of Bio.SeqRecord.SeqRecord to pop from aln

        Returns a tuple containing the popped SeqRecord and a
        copy of aln without seqid's SeqRecord.

        Raises KeyError when seqid is not present in aln.
    '''

    aln_d = SeqIO.to_dict(aln)
    seq = aln_d.pop(seqid)
    # ``values()`` exists on both Python 2 and 3; ``itervalues()`` is
    # Python 2 only.
    aln = Align.MultipleSeqAlignment(aln_d.values())

    return seq, aln
Example #29
0
def main():
    """Run the ABBA-BABA computation for every record of the input FASTA files.

    Steps:
      1. write the list of input files next to the output
         (``.abbababa`` replaced by ``.flist`` in ``options.output``);
      2. index every input FASTA file and the ancestral FASTA file;
      3. for each record name (sorted), truncate all sequences and the
         ancestral sequence to the shortest length, build an alignment and
         pass both to ``do_abbababa``;
      4. write the collected positions as BED.

    ``f_ab``, ``f_extra`` and ``bt_positions`` are module globals assigned
    here; presumably they are used by ``do_abbababa`` - confirm.

    Returns 1.
    """
    global f_ab, f_extra, bt_positions
    seqs = {}
    records = set()
    fname_list = [basename(fpath) for fpath in options.input_files]

    with open(options.output.replace(".abbababa", ".flist"), "w") as fout:
        fout.write("\n".join(options.input_files))

    for fpath in options.input_files:
        fname = basename(fpath)
        seqs[fname] = SeqIO.index(fpath, "fasta")
        records.update(str(key) for key in seqs[fname].keys())
        # single-argument print(...) behaves the same on Python 2 and 3
        print(fname)

    anc = SeqIO.index(options.anc, "fasta")

    print("\n")

    f_ab = open(options.output, "w")
    try:
        f_extra = open(options.extra, "w")
        try:
            bt_positions = BedToolPositions()
            for record in sorted(records):
                sequences = [
                    seqs.get(seq_key).get(record) for seq_key in fname_list
                ]

                min_alignment_length = min(
                    [len(sequence) for sequence in sequences] +
                    [len(anc.get(record))])

                per_chr_alignment = Align.MultipleSeqAlignment(
                    [sequence[:min_alignment_length]
                     for sequence in sequences])

                do_abbababa(per_chr_alignment,
                            anc.get(record)[:min_alignment_length])

            bt_positions.write_to_BED(options.bed_out)
        finally:
            # close the output handles even when a record fails
            f_extra.close()
    finally:
        f_ab.close()

    return 1
def clean_alignment(alignment):
    """
    Remove ambiguities from alignment.

    Iterate over sites in the alignment and build a new alignment containing
    either only pure ATGC sites (-c) or sites with up to a specified proportion
    of N's (-c -n FLOAT).

    The mode is selected by the module-level ``args.n_ratio``: when it is
    truthy, a site is kept if its fraction of "N" does not exceed
    ``args.n_ratio``; otherwise a site is kept only if it contains none of
    the characters ``NYRKMWSBDHV-`` (case-insensitive).

    Returns the new alignment; the input alignment is not modified.
    """
    site_length = len(alignment[:, 0])  # number of sequences
    n_sites = len(alignment[0])
    cleaned_alignment = Align.MultipleSeqAlignment(
        [seq[:0] for seq in alignment])

    if args.n_ratio:
        logging.info(f"Removing sites with > {int(args.n_ratio * 100)}% of " +
                     f"N's from '{alignment[0].name}'")
        for pos in range(n_sites):
            n_count = alignment[:, pos].upper().count('N')
            if n_count / site_length <= args.n_ratio:
                cleaned_alignment += alignment[:, pos:pos + 1]

    else:
        logging.info("Removing sites with ambiguities from " +
                     f"'{alignment[0].name}'")
        ambiguous = frozenset('NYRKMWSBDHV-')
        for pos in range(n_sites):
            column = alignment[:, pos].upper()
            # Skip only this site.  The previous code used ``break`` here
            # for large alignments, which dropped every site after the
            # first ambiguous one.
            if any(char in ambiguous for char in column):
                continue
            cleaned_alignment += alignment[:, pos:pos + 1]

    return cleaned_alignment