Example #1
0
def annotate_ref(msa_fobj, msa_informat, outfile_fobj, split, outformat):
    """Split an alignment into two halves at row index ``split``.

    The first ``split`` records are written to ``outfile_fobj``; the rest go
    to a sibling file named ``outfile_fobj.name + '.2'``.

    :param msa_fobj: readable handle containing the input alignment
    :param msa_informat: Bio.AlignIO format name of the input
    :param outfile_fobj: writable handle for the first half
    :param split: row index at which to cut the alignment
    :param outformat: Bio.AlignIO format name used for both outputs
    """
    msa_data = AlignIO.read(msa_fobj, msa_informat)
    align1 = msa_data[:split]
    align2 = msa_data[split:]
    AlignIO.write(AlignIO.MultipleSeqAlignment(align1), outfile_fobj,
                  outformat)
    # Context manager so the second handle is always closed (the original
    # opened it manually and never closed it).
    with open(outfile_fobj.name + '.2', 'w') as outfile_fobj2:
        AlignIO.write(AlignIO.MultipleSeqAlignment(align2), outfile_fobj2,
                      outformat)
def simulate(q, lock):
    """Worker loop: pull (sim_no, evolver) jobs from ``q``, run each
    simulation and write the resulting alignment to a fasta file.

    A ``(sim_no, None)`` item acts as the shutdown sentinel.

    NOTE(review): Python 2 code (print statement); uses module-level
    ``args`` and ``out_dir``.  The sentinel branch exits without calling
    ``q.task_done()`` -- if the producer join()s the queue this could hang;
    confirm the intended protocol.

    :param q: queue of (simulation number, evolver callable) tuples
    :param lock: lock serializing the progress printout across workers
    """
    while True:
        sim_no, e = q.get()
        if e is None:
            break
        # Run the evolver with all of its own file outputs disabled; the
        # sequences are retrieved afterwards instead.
        e(seqfile=None, ratefile=None, infofile=None)
        seq_dict = e.get_sequences()
        # Sort by taxon name so the record order is deterministic.
        seq_tuple = sorted(seq_dict.items())
        align = AlignIO.MultipleSeqAlignment(
            SeqRecord(
                Seq(seqstr, generic_dna),
                id=tax,
                description='',
            ) for (tax, seqstr) in seq_tuple)
        out_align = os.path.join(
            out_dir, '%(pre)s_%(sim)d.fasta' % {
                "pre": args.prefix,
                "sim": sim_no
            })
        AlignIO.write(align, out_align, "fasta")
        # Serialize console output across worker processes.
        lock.acquire()
        try:
            print "Process %d reporting: wrote alignment %d" % (os.getpid(),
                                                                sim_no)
        finally:
            lock.release()
        q.task_done()
Example #3
0
def split_alignment(alignment_file, proteins):
    '''
    in: path to fasta-format alignment file, [(gene, (start,end)), ...]
    out: separate .phyx-format (relaxed phylip) alignment files for each gene, based on provided coordinates
    NB: removes any sequences without > 70% non-gap sites from each segment alignment
    '''
    align = AlignIO.read(open(alignment_file, 'r'), 'fasta')
    ofile_stem = alignment_file.split('/')[-1].split('.')[
        0]  # name output like alignmentfilename_protein.phyx
    ofile_list = []
    for protein, (start, end) in proteins:
        ofile_name = ofile_stem + '_%s.phyx' % (protein)
        ofile_list.append(ofile_name)
        start, end = start - 1, end - 1  # adjust for pythonic coordinates
        align_segment = align[:, start:end +
                              1]  #[allrows, startcolumn:endcolumn] endcolumn += 1 for inclusive slicing
        filtered_align_segment = AlignIO.MultipleSeqAlignment([])
        for seq in align_segment:
            # Convert every ambiguous N/n to a gap before gap-counting.
            # NOTE(review): assigning seq.seq.data only works on old
            # Biopython Seq objects (modern Seq is immutable) -- confirm the
            # pinned Biopython version before upgrading.
            seq.seq.data = str(seq.seq).replace('n', '-').replace('N', '-')
            if float(str(seq.seq).count('-')) / float(
                    len(str(seq.seq))
            ) <= 0.30:  # Require at least 70% of sites are not gaps to include in segment alignment
                filtered_align_segment.append(seq)
        AlignIO.write(filtered_align_segment, ofile_name, 'phylip-relaxed')
    return ofile_list
def reduce_msa_to_seqs_by_name(msa, keep_names_lst):
    """Return a copy of ``msa`` restricted to the records named in
    ``keep_names_lst`` (in that order), then strip columns left with only
    gaps / non-ACTGU characters.

    :param msa: Bio.Align.MultipleSeqAlignment to filter
    :param keep_names_lst: record ids to keep, in the desired output order
    :raises ValueError: if a requested name is not present in ``msa``
    """
    # Build the id -> row-index map once instead of calling list.index()
    # per requested name (the original was O(rows * names)).  setdefault
    # keeps the FIRST occurrence, matching list.index semantics for
    # duplicate ids.
    row_index = {}
    for i, rec in enumerate(msa):
        row_index.setdefault(rec.id, i)
    new_msa = []
    for name in keep_names_lst:
        try:
            new_msa.append(msa[row_index[name], :])
        except KeyError:
            # Preserve the original exception type (list.index raised
            # ValueError for a missing name).
            raise ValueError('{!r} is not in the alignment'.format(name))
    # Remove positions that are just gaps after removal of sequences.
    return remove_nonACTGU_sites(AlignIO.MultipleSeqAlignment(new_msa))
Example #5
0
def run_muscle(fasta_file, out_file=None, muscle_params='', reorder=True):
    """
    beware, muscle does not keep sequence order and the --stable switch is broken
    :param fasta_file: path to the input fasta file
    :param out_file: path for the clustal output; a temp file is created if None
    :param muscle_params: extra muscle command-line parameters as one string
    :param reorder: restore the input-file sequence order in the output
    :return: path to the clustal-format alignment file
    """
    ml.info('Running muscle.')
    ml.debug(fname())
    if out_file:
        cl_file = out_file
    else:
        cl_fd, cl_file = mkstemp(prefix='rba_',
                                 suffix='_07',
                                 dir=CONFIG.tmpdir)
        os.close(cl_fd)

    cmd = [
        '{}muscle'.format(CONFIG.muscle_path), '-clwstrict', '-seqtype', 'rna',
        '-out', cl_file, '-in', fasta_file, '-quiet'
    ]
    if muscle_params != '':
        # Extend argv with the individual tokens.  The original shell-quoted
        # the tokens and joined them back into ONE argv element; since call()
        # runs with shell=False, muscle then received that whole string as a
        # single unparseable argument.  No re-quoting is needed for a list.
        cmd += shlex.split(muscle_params)
    ml.debug(cmd)

    with TemporaryFile(mode='w+', encoding='utf-8') as tmp:
        r = call(cmd, stdout=tmp, stderr=tmp)
        if r:
            msgfail = 'Call to muscle failed.'
            ml.error(msgfail)

            tmp.seek(0)
            raise exceptions.MuscleException(msgfail, tmp.read())

        if reorder:
            # reorder sequences acording to input file
            with open(fasta_file, 'r') as ff, open(cl_file, 'r+') as oo:
                orig_seqs = [i.id for i in SeqIO.parse(ff, format='fasta')]
                muscle_align = {
                    i.id: i
                    for i in AlignIO.read(oo, format='clustal')
                }

                # reorder (muscle truncates names to 32 characters)
                reo_alig = []
                for s_name in orig_seqs:
                    reo_alig.append(muscle_align[s_name[:32]])
                alig = AlignIO.MultipleSeqAlignment(reo_alig)
                # overwrite the file in place with the reordered alignment
                oo.seek(0)
                AlignIO.write(alig, oo, format='clustal')
                oo.truncate()

        return cl_file
Example #6
0
 def test_first_sequence_in_is_first_sequence_out(self):
     # Regression test: get_interval_seqs must keep the alignment's row
     # order (first record in -> first sequence out) and, per the expected
     # value below, strip gap characters ('-') from the returned strings.
     alignment = AlignIO.MultipleSeqAlignment([
         SeqRecord(Seq("TTTT")),
         SeqRecord(Seq("AAAA")),
         SeqRecord(Seq("CC-C")),
     ])
     result = get_interval_seqs(alignment)
     expected = ["TTTT", "AAAA", "CCC"]
     self.assertEqual(expected, result)
Example #7
0
def catAln(alns):
    """Horizontally concatenate alignments that share the same row set.

    Row ``k`` of the result is the concatenation of row ``k`` from every
    alignment in ``alns``; ids and the alphabet come from the first
    alignment.
    """
    alphabet = alns[0][0, :].seq.alphabet
    n_rows = len(alns[0])
    # Join each row's sequence strings across all alignments.
    joined = [''.join(str(aln[row].seq) for aln in alns)
              for row in range(n_rows)]
    records = (SeqIO.SeqRecord(Seq(joined[row], alphabet=alphabet),
                               id=alns[0][row, :].id)
               for row in range(n_rows))
    return AlignIO.MultipleSeqAlignment(records)
Example #8
0
    def _try_rescue(profile_file):
        """Append one trailing gap column to every sequence of a clustal
        profile and write the result to a fresh temp fasta file.

        Rationale: a gap-free "alignment" looks like plain unaligned
        sequences to downstream tools; the extra gap forces aligned
        semantics.

        :param profile_file: path to a clustal-format profile alignment
        :return: path of the new fasta file (caller owns/removes it)
        """
        # beware AlignIO truncates sequence names so they become non-unique, then clustalo also fails
        ml.warning(
            'Trying rescue for profile alignment if profile has no gaps, sequences appears not aligned. '
            'Appending trailing gap to overcome the issue.')
        a = AlignIO.read(profile_file, format='clustal')
        s = [SeqRecord(Seq(str(i.seq) + '-'), id=i.id) for i in a]
        fa = AlignIO.MultipleSeqAlignment(s)

        # mkstemp keeps the file after this function returns and places it
        # in the configured tmpdir.
        fd, temp = mkstemp(prefix='rba_', suffix='_56', dir=CONFIG.tmpdir)
        with os.fdopen(fd, 'w') as fh:
            AlignIO.write(fa, fh, format='fasta')
        return temp
def run_raf(raf_params):
    """Run ``raf predict`` on an RNA file and post-process its output.

    raf emits a fasta "alignment" whose final record is the predicted
    consensus secondary structure.  That record is moved into
    ``column_annotations['secondary_structure']`` and the remaining records
    are re-written in stockholm and clustal formats.

    :param raf_params: tuple of (input rna file path,
        stockholm output path, clustal output path)
    """
    (rna_file_path, raf_output_file_path, raf_output_file_path_2) = raf_params
    raf_command = "raf predict " + rna_file_path
    (output, _, _) = utils.run_command(raf_command)
    # Context manager instead of manual open/close: the original leaked the
    # handle if write() raised.
    with open(raf_output_file_path, "w+") as raf_output_file:
        raf_output_file.write(output.decode())
    sta = AlignIO.read(raf_output_file_path, "fasta")
    # The last record is the secondary-structure string, not a sequence.
    recs = sta[:-1]
    new_sta = AlignIO.MultipleSeqAlignment(recs)
    new_sta.column_annotations["secondary_structure"] = str(sta[-1].seq)
    AlignIO.write(new_sta, raf_output_file_path, "stockholm")
    AlignIO.write(new_sta, raf_output_file_path_2, "clustal")
Example #10
0
def trimaln(aln, target_ids, gaps=0.9):
    """Trim a fasta alignment string: drop high-gap columns and rows.

    Column gap fractions are computed over the ``target_ids`` sequences
    only, but the trimmed output keeps every (deduplicated) input sequence.

    :param aln: alignment as a fasta-formatted string
    :param target_ids: ids of the sequences used for the column gap filter
    :param gaps: drop columns/rows whose gap fraction is >= this value
    :return: trimmed alignment as a fasta-formatted string
    """
    # Read the alignment into a biopython structure
    aln = AlignIO.read(StringIO(aln), 'fasta')
    # One row per sequence, one column per alignment position.
    # NOTE(review): relies on pd.DataFrame expanding each SeqRecord into
    # per-site characters -- confirm with the installed pandas/Biopython.
    dfaln = pd.DataFrame(aln)
    dfaln.index = [x.id for x in aln]
    # get only the targets
    aln = AlignIO.MultipleSeqAlignment([x for x in aln if x.id in target_ids])
    nseqs = len(aln)
    # Per-column gap fraction among the target sequences only.
    c = pd.DataFrame(aln).apply(lambda x: sum(x == '-') / nseqs, axis=0)
    dfaln = dfaln.loc[:, c[c < gaps].index]
    # Per-row gap fraction over the already column-filtered width.
    e = dfaln.apply(lambda x: sum(x == '-') / dfaln.shape[1], axis=1)
    dfaln = dfaln[e < gaps].drop_duplicates()
    # Re-emit the surviving rows as fasta text.
    l = ['>%s\n%s\n' % (x[0], ''.join(x[1]).upper().strip()) for x in
         dfaln.iterrows()]
    return '\n'.join(l).replace('\n\n', '\n')
Example #11
0
def format_concatenated_alignment():
    """Filter all-gap strains out of the concatenated alignment and relabel
    survivors as "[<id>]<strain name>".

    Reads ``DATA_DIR/all_alignments`` and writes
    ``DATA_DIR/filtered_tree_alignment``, both in FASTA_FILE_TYPE format.
    """
    logger = logging.getLogger(__name__)
    strain_names_map = build_strain_names_map()
    # Context managers close both handles even on error; the original passed
    # bare open() results to AlignIO and leaked them.
    with open(os.path.join(DATA_DIR, "all_alignments"), "r") as in_fh:
        tree_alignment = AlignIO.read(in_fh, FASTA_FILE_TYPE)
    tree_alignment_filtered = AlignIO.MultipleSeqAlignment([])
    # 'idx' instead of shadowing the builtin 'id'.
    for idx, strain in zip(range(STRAINS_COUNT), tree_alignment):
        if all(c == '-' for c in strain.seq):
            # A strain whose row is nothing but gaps carries no signal.
            logger.info("skipping filtered strain %d" % idx)
        else:
            logger.info("adding id to strain %d" % idx)
            strain.id = "[" + str(idx) + "]" + strain_names_map[idx]
            strain.description = ''
            tree_alignment_filtered.append(strain)
    with open(os.path.join(DATA_DIR, "filtered_tree_alignment"), "w") as out_fh:
        AlignIO.write(tree_alignment_filtered, out_fh, FASTA_FILE_TYPE)
Example #12
0
    def visual_check(self):
        """Interactive review loop: show the alleles plot, let the user
        select sequences to discard, then rebuild the alignment, SSR
        regions and phylogeny from the survivors.

        The loop only exits when the user answers yes to keeping all
        remaining sequences (ask_user with quit=True may also terminate the
        whole program).
        """
        while True:
            vca = VisualAlleleCheck(suptitle=self.subdir.name, **self.__dict__)
            # Pre-select anything discarded in a previous round so the user
            # sees (and can revise) earlier choices.
            if self.seq_discarded:
                while self.seq_discarded:
                    vca.selected.append(self.seq_discarded.pop())
                vca.change_rect_color()
            vca.show()
            if len(vca.selected) > 0:
                ds = ", ".join(
                    [self.seqdat[i].id for i in sorted(vca.selected)])
                msg = "Would you like to discard {} sequence{} ({})?".format(
                    len(vca.selected), "s" if len(vca.selected) > 1 else "",
                    ds)
                if ask_user(msg, default="y", quit=True):
                    # Drop the selected indices from both the raw sequence
                    # data and the alignment, keeping them in sync.
                    self.seqdat = [
                        self.seqdat[i] for i in range(len(self.seqdat))
                        if i not in vca.selected
                    ]
                    self.align = AlignIO.MultipleSeqAlignment([
                        self.align[i] for i in range(len(self.align))
                        if i not in vca.selected
                    ])
                    # remove gaps
                    self.align = remove_gap_pos(self.align)

                    self.ssr_regions, self.motifs = find_variable_ssrs(
                        self.align, **self.kwargs)

                    # add rep_data to seqdat
                    self.add_rep_data()

                    # A tree needs at least two sequences.
                    if len(self.align) > 1:
                        self.tree = construct_tree(self.align,
                                                   self.ssr_regions,
                                                   self.motifs)
                    else:
                        self.tree = None
                    print("Reconstructing phylogeny ...")
            else:
                msg = "Keep all sequences and write results?"
                if ask_user(msg, default="y", quit=True):
                    break
Example #13
0
def cut_alignment(input_fasta: str, output_file: str, begin: int, end: int) -> None:
    """Write the column slice ``[begin, end)`` of a fasta alignment.

    :param input_fasta: path to the fasta alignment to slice
    :param output_file: destination path (fasta format)
    :param begin: first column (0-based, inclusive)
    :param end: last column (exclusive)
    :raises RuntimeError: if end < begin or the slice runs past the
        alignment's length
    """
    with open(input_fasta) as handle:
        alignment = AlignIO.read(handle, "fasta")

        length = alignment.get_alignment_length()
        new_length = end - begin

        # Guard clauses: reject an inverted or out-of-range window.
        if new_length < 0:
            raise RuntimeError("End position must be higher than the begin position")
        if begin + new_length > length:
            raise RuntimeError("Alignment is too short")

        # Slice each record column-wise and rebuild the alignment.
        sliced = [record[begin:end] for record in alignment]
        AlignIO.write(AlignIO.MultipleSeqAlignment(sliced), output_file, "fasta")
Example #14
0
def annotate_ref(msa_fobj, map_fobj, msa_informat, outfile_fobj,
                 msa_outformat):
    """Rewrite alignment record ids as "Genus species" from a mapping file.

    ``map_fobj`` holds tab-separated "<seq_id>\\t<genus species>" lines.
    Records whose id is missing from the map -- or whose mapped value has no
    space to split into genus and species -- keep their original id.

    :param msa_fobj: readable handle with the input alignment
    :param map_fobj: readable handle with the id -> name mapping
    :param msa_informat: Bio.AlignIO format name of the input
    :param outfile_fobj: writable handle for the annotated alignment
    :param msa_outformat: Bio.AlignIO format name of the output
    """
    map_data = dict(
        [line.split("\t") for line in map_fobj.read().splitlines() if line])
    msa_data = AlignIO.read(msa_fobj, msa_informat)
    annotated_align = []
    for align in msa_data:
        try:
            genus, species = map_data[align.id].split(" ", 1)
            align.id = "{} {}".format(genus, species)
        except (KeyError, ValueError):
            # KeyError: id absent from the map.  ValueError: mapped name has
            # no space to unpack -- the original code crashed here; skipping
            # matches the existing skip-on-missing behavior.
            pass
        annotated_align.append(align)

    AlignIO.write(AlignIO.MultipleSeqAlignment(annotated_align), outfile_fobj,
                  msa_outformat)
Example #15
0
def main(commandline_args):
    """Slide the first named alignment group against the second in
    window-sized increments, score each arrangement with TwinCons, and
    print the uninterrupted positively-scoring stretches per increment.

    :param commandline_args: raw argv list parsed by
        create_and_parse_argument_options
    """
    comm_args = create_and_parse_argument_options(commandline_args)
    alignment_file = read_align(comm_args.alignment_path)
    sliced_alignments = slice_by_name(alignment_file)
    # Groups are processed in sorted-name order: [0] slides, [1] stays fixed.
    first_aln = sorted(list(sliced_alignments.keys()))[0]
    slided_scores = {}  #Sliding increment -> (scores,alignment objects)
    for i in range(0, sliced_alignments[first_aln].get_alignment_length(),
                   comm_args.window):
        #print(i)
        # Start each round from the fixed second group's records.
        second_aln = AlignIO.MultipleSeqAlignment([])
        for record in sliced_alignments[sorted(list(
                sliced_alignments.keys()))[1]]:
            second_aln.append(record)
        #Reorders an alignment group using the specified window size
        # (columns are rotated left by i: tail slice + head slice).
        reordered_aln = sliced_alignments[first_aln][:, -(
            sliced_alignments[first_aln].get_alignment_length() -
            i):] + sliced_alignments[first_aln][:, :i]
        for record in reordered_aln:
            second_aln.append(record)
        alnindex_score, gapped_sliced_alns, number_of_aligned_positions, gp_mapping = TwinCons.main(
            ['-as',
             format(second_aln, "fasta"), '-r', '-mx', 'blosum62'])

        # Keep only the primary score per alignment index.
        out_dict = {}
        for x in alnindex_score.keys():
            out_dict[x] = alnindex_score[x][0]
        slided_scores[i] = out_dict

    # Report contiguous positive-score stretches for every increment.
    # NOTE(review): 'file' here is just the increment key (shadows the old
    # py2 builtin name); negdata is computed but unused.
    for file in slided_scores:
        print("Increment is " + str(file))
        alnindex = sorted(slided_scores[file].keys())
        posdata, negdata = uninterrupted_stretches(alnindex,
                                                   slided_scores[file],
                                                   comm_args)
        for x in sorted(posdata.keys()):
            print(x, posdata[x])
                unknown = False  # If alignment contains sequence with unknown amino acids
                proteins = []  # Proteins in filtered alignment
                species = set()  # Species in filtered alignment
                records = [
                ]  # Records to generate filtered alignment (for later re-alignment)

                # Iterate through records
                for i, record in enumerate(MSA):
                    if 'X' in record.seq.upper():
                        unknown = True
                    else:
                        proteins.append(record.name)
                        species.add(record.name.split('.')[0])
                        records.append(record)
                MSA = AlignIO.MultipleSeqAlignment(
                    records
                )  # Re-assign MSA to remove sequences with unknown amino acids

                if unknown and len(
                        MSA
                ) > 1:  # Re-align if unknown flag is True and more than sequence is present
                    args = [
                        'xvfb-run',
                        '/home/singlemd/miniconda3/envs/ete3/bin/ete3',
                        'build', '-w', 'eggnog41', '-a',
                        f'out/raw/{alignment_id}/unaligned.fa', '-o',
                        f'out/raw/{alignment_id}', '--dealign', '--cpu', '4'
                    ]
                    path_aligned = f'out/raw/{alignment_id}/metaligner_trimmed-trimal01-prottest_default-phyml_default/unaligned.fa.final_tree.fa'

                    # Create directory to store ete3 input and output
Example #17
0
 def test_ambiguous_bases_one_seq_with_repeated_base(self):
     # 'R' is the IUPAC ambiguity code for A/G; the two leading Rs must be
     # expanded into all 4 combinations.  Output order is not part of the
     # contract, hence the set comparison.
     alignment = AlignIO.MultipleSeqAlignment([SeqRecord(Seq("RRAAT"))])
     result = get_interval_seqs(alignment)
     expected = {"GAAAT", "AAAAT", "GGAAT", "AGAAT"}
     self.assertEqual(set(result), expected)
Example #18
0
def main(args):
    """Reconcile a GISAID metadata table, a fasta alignment and a newick
    tree: prune tree leaves that lack metadata or alignment rows, rename
    leaves from GISAID ids to strain names, and write the pruned tree plus
    the matching alignment/metadata subsets (and a separate "interest"
    query subset) into ``args.outdir``.

    NOTE(review): uses pandas (pd), numpy (np), a dendropy-style Tree and
    Bio.AlignIO from module scope -- confirm the imports at file top.
    """
    # Load metadata and generate strain - gisaid id dictionary.
    md = pd.read_csv(args.metadata, sep="\t")
    print("Entries in metadata: {}".format(len(md)))
    md_gisaid_list = md["gisaid_epi_isl"].to_list()
    gisaid_dict = md.loc[~md["gisaid_epi_isl"].isna(),
                         ["strain", "gisaid_epi_isl"]]
    gisaid_dict = gisaid_dict.set_index("gisaid_epi_isl")
    gisaid_dict = gisaid_dict["strain"].to_dict()

    # Load alignment
    alignment = AlignIO.read(args.alignment, "fasta")
    print("Sequences in alignment: {}".format(len(alignment)))
    alignment_list = [i.name for i in alignment]

    # Load tree
    tree = Tree.get(path=args.tree, schema="newick")
    print("Leaves in tree: {}".format(len(tree.taxon_namespace)))

    # Determine leaves which names cannot be assigned.
    # Newick labels use spaces where metadata uses underscores.
    tree_leaves = [i.label for i in tree.taxon_namespace]
    tree_leaves = [i.replace(" ", "_") for i in tree_leaves]
    leaf_missing_md = np.setdiff1d(tree_leaves, md_gisaid_list)

    # Remove leaves identified
    print("Leaves in tree but not in metadata: {}".format(
        len(leaf_missing_md)))
    tree = tree.extract_tree_without_taxa_labels(
        [i.replace("_", " ") for i in leaf_missing_md])
    tree.purge_taxon_namespace()
    print("Leaves in tree after pruning: {}".format(len(tree.taxon_namespace)))

    # Rename leaves to match metadata and alignment
    print("Renaming leaves to match metadata and alignment... ", end="")
    leaves = list()
    for i in tree.taxon_namespace:
        try:
            i.label = gisaid_dict[i.label.replace(" ", "_")]
        except KeyError:
            # No strain name for this GISAID id: keep the original label.
            pass
        leaves.append(i.label)
    print("Done")

    # Remove leaves that aren't in alignment
    leaf_missing_align = np.setdiff1d(leaves, alignment_list)
    print("Leaves in tree but not in alignment: {}".format(
        len(leaf_missing_align)))
    tree = tree.extract_tree_without_taxa_labels(leaf_missing_align)
    tree.purge_taxon_namespace()
    print("Leaves in tree after pruning: {}".format(len(tree.taxon_namespace)))

    # Update tree_leaves list
    tree_leaves = [i.label for i in tree.taxon_namespace]

    # Filter alignment to tips in tree
    tree_alignment = list()
    for i in alignment:
        if i.name in tree_leaves:
            tree_alignment.append(i)
    tree_alignment = AlignIO.MultipleSeqAlignment(tree_alignment)

    # Filter metadata to tips in tree
    tree_md = md.loc[md["strain"].isin([i.name for i in tree_alignment])]

    # Filter metadata and alignment to query
    query_md = md.loc[md["interest"] == "interest"]
    interests = query_md["strain"].to_list()
    query_alignment = [i for i in alignment if i.name in interests]
    query_alignment = AlignIO.MultipleSeqAlignment(query_alignment)

    # Write files to disk
    tree.write(path=os.path.join(args.outdir, "global.tree"), schema="newick")
    AlignIO.write(tree_alignment, os.path.join(args.outdir, "alignment.fasta"),
                  "fasta")
    tree_md.to_csv(os.path.join(args.outdir, "metadata.csv"), index=False)
    AlignIO.write(query_alignment, os.path.join(args.outdir, "query.fasta"),
                  "fasta")
    query_md.to_csv(os.path.join(args.outdir, "query.csv"), index=False)
Example #19
0
# Select alignment records whose ids appear in the provided list file and
# write them out; report requested ids that were not found.
parser.add_argument('-o','--outfile', type=argparse.FileType('w'), help ="MSA output file",  required=True)
parser.add_argument('-v','--outformat', default="clustal", help ="MSA output format")
parser.add_argument('-l','--list', type=argparse.FileType('r'),  required = True)
args = parser.parse_args()

sel_seqs = args.list.read().split()
msa_data = AlignIO.read(args.msa_file, args.informat)

sel_align = []
found_align = []
for align in msa_data:
    if align.id in sel_seqs:
        sel_align.append(align)
        found_align.append(align.id)

AlignIO.write(AlignIO.MultipleSeqAlignment(sel_align), args.outfile, args.outformat)

# Report requested ids missing from the alignment.  The original line was
# broken three ways: sys has no write() (must be sys.stderr.write), it
# iterated found_align with a vacuously false condition
# ("seq_id in found_align if seq_id not in found_align"), and it printed
# the loop-leaked align.id instead of the missing id.
for seq_id in sel_seqs:
    if seq_id not in found_align:
        sys.stderr.write("Not found:\t{}\n".format(seq_id))
    

    






        
    

        
Example #20
0
         i = np.random.choice(range(len(current_species)))
     a = current_species[i]
     a.gene_duplication()
     a.speciation(
     )  # event of speciation in which two new species diverge from the previous one.
     current_species = [
         x for x in sp_tree.nodes() if sp_tree.out_degree(x) == 0
     ]  # updates the "leaves" in species tree.
     leaves = [x for x in seq_tree.nodes() if seq_tree.out_degree(x) == 0
               ]  # updates the "leaves" in sequences tree.
 if len(orthologs) < args.n_ort:
     print('Warning: few sequences to have %d ortholog groups!' %
           args.n_ort)
 ###====================================================================================================
 colection = AlignIO.MultipleSeqAlignment([
     SeqRecord(Seq(seq.sequence), id=str(seq))
     for seq in sequences.colection
 ])
 AlignIO.write(
     colection,
     open('%s_all_sequences.%s' % (args.out, args.msa_format), 'w'),
     args.msa_format)
 alignment = AlignIO.MultipleSeqAlignment(build_MSA(seq_tree, first_seq))
 AlignIO.write(
     alignment,
     open('%s_current_sequences.%s' % (args.out, args.msa_format), 'w'),
     args.msa_format)
 tree = Phylo.BaseTree.Tree(root=build_tree(seq_tree, first_seq),
                            rooted=True)
 Phylo.write(tree, '%s_gene_tree.%s' % (args.out, args.tree_format),
             args.tree_format)
 cladogram = Phylo.BaseTree.Tree(root=build_tree(sp_tree, first_sp,
Example #21
0
def divergence(fastain, patient_id, cutoff):
    """Compute per-timepoint divergence (overall, N, S, dN, dS) of a
    patient's sequences from the first-timepoint consensus, and append the
    per-timepoint summary rows to the module-level gag/gp41 CSV handles.

    NOTE(review): Python 2 code (print statements).  Relies on module-level
    names split, number_of_N_and_S_sites, natural_keys, get_seq_freq and the
    open csvfile_gag_b / csvfile_gp41_b handles -- confirm before porting.

    :param fastain: path to the input fasta file; also selects the protein
        ("gag" vs "gp41") by substring match
    :param patient_id: identifier written into the CSV rows
    :param cutoff: minimum derived fraction of total_seq a site must exceed
        to contribute to the divergence sums
    """
    # fasta = open('%s' % filename, 'r')

    split_fasta = split(fastain, 1)
    seqs_by_timepoint = split_fasta[0]
    total_seq = split_fasta[1]

    # conseq = consensus.seq[(sites_pos[0]-1):(sites_pos[1]-1)]
    # conseq = Seq(str(consensus).replace('-','N'))
    # consensus = Seq(conseq.seq.tostring().replace('-','N'))

    # seq_length = len(consensus)
    # Per-timepoint summary-statistic accumulators (mean / percentiles /
    # std for overall, nonsynonymous, synonymous, dN and dS divergence).
    mean_divergence = []
    median_divergence = []

    lower_divergence_25 = []
    upper_divergence_75 = []
    lower_divergence_5 = []
    upper_divergence_95 = []
    divergence_std = []
    mean_N_divergence = []
    median_N_divergence = []

    lower_N_divergence_25 = []
    upper_N_divergence_75 = []
    lower_N_divergence_5 = []
    upper_N_divergence_95 = []
    N_divergence_std = []
    mean_S_divergence = []
    median_S_divergence = []
    lower_S_divergence_25 = []
    upper_S_divergence_75 = []
    lower_S_divergence_5 = []
    upper_S_divergence_95 = []
    S_divergence_std = []
    dN = []
    dN_med = []
    dN_lower_25 = []
    dN_upper_75 = []
    dN_lower_5 = []
    dN_upper_95 = []
    dN_std = []
    dS = []
    dS_med = []
    dS_lower_25 = []
    dS_upper_75 = []
    dS_lower_5 = []
    dS_upper_95 = []
    dS_std = []
    patient = []

    # parts = str.split(fastain, "/")
    # parts2 = str.split(parts[len(parts)-1], "_")

    patient.append(patient_id)

    nonsyn_sites, syn_sites = number_of_N_and_S_sites(fastain, None)

    # Timepoints sorted with natural (numeric-aware) ordering; py2 keys()
    # returns a list, so in-place sort works.
    sorted_timepoints = seqs_by_timepoint.keys()
    sorted_timepoints.sort(key=natural_keys)

    print sorted_timepoints
    first_timepoint = AlignIO.MultipleSeqAlignment(
        seqs_by_timepoint[sorted_timepoints[0]])

    # Majority-rule consensus of the FIRST timepoint is the ancestral
    # reference; unresolved 'X' positions become 'N'.
    consensus = AlignInfo.SummaryInfo(first_timepoint).dumb_consensus(
        threshold=0.01).upper()
    conseq = Seq(str(consensus).replace('X', 'N'))

    prot = ""
    if "gag" in fastain:
        prot = "gag"
    else:
        prot = "gp41"

    sampleTimes = []
    for t in sorted_timepoints:
        sampleTimes.append(float(t))

    # for f in filelist:
    for t in range(0, len(sorted_timepoints)):

        # Per-site divergence values collected for this timepoint.
        divergence = []
        divergence_N = []
        divergence_S = []
        divergence_dN = []
        divergence_dS = []
        # diff = 0

        seqs_at_t = seqs_by_timepoint[sorted_timepoints[t]]

        seq_length = len(seqs_at_t[0].seq)

        seq_freq = get_seq_freq(seqs_at_t)

        seqs_at_t_array = np.asarray(seqs_at_t)

        # i want to calculate derived freq wrt to consequence not minor freq per site
        #for c in xrange(0,len(consensus_seqs)):

        # First pass: per-site derived/ancestral counts weighted by each
        # sequence's frequency, skipping gap characters.
        full_der_freq = []

        total_site_freq = []

        for i in range(seq_length):

            site_a = seqs_at_t_array[:, i]

            anc_freq = 0
            der_freq = 0

            #gap_count = "".join(site_a).count('-')

            for j in range(0, len(seq_freq)):

                if site_a[j] != '-':
                    if conseq[i].lower() == site_a[j]:
                        anc_freq += seq_freq[j]
                    else:
                        der_freq += seq_freq[j]

                # if (site_a[j] == 'a'):
                #     A += seq_freq[j]
                # elif (site_a[j] == 'c'):
                #     C += seq_freq[j]
                # elif (site_a[j] == 't'):
                #     T += seq_freq[j]
                # elif (site_a[j] == 'g'):
                #     G += seq_freq[j]

            # NOTE(review): this rebinds total_seq each site, clobbering the
            # value unpacked from split_fasta above -- confirm intended.
            total_seq = sum([der_freq, anc_freq])

            full_der_freq.append(der_freq)

            total_site_freq.append(total_seq)

            #print [der_freq, anc_freq], total_seq
            #total_site_freq_per_consensus.append(total_site_freq)
            #full_der_freq_per_consensus.append(full_der_freq)

        #for c in xrange(0, len(consensus_seqs)):
        # Second pass: classify differences at sites whose derived
        # frequency exceeds the cutoff as synonymous / nonsynonymous via
        # codon translation against the consensus.
        for i in range(seq_length):

            # print i, full_der_freq[i], patient_id, sorted_timepoints[t], total_seq, float(
            #     full_der_freq[i]) / float(total_seq)
            diff = 0
            diff_N = 0
            diff_S = 0
            count = total_site_freq[i]
            count1 = 0
            if full_der_freq[i] > cutoff * total_seq:

                for each in seqs_at_t:

                    # Sequence frequency is encoded in the record name as
                    # the third underscore-separated field.
                    parts = str.split(each.name, "_")
                    freq = int(parts[2].strip())

                    seq = Seq(str(each.seq).upper().replace('-', 'N'))

                    if (str(conseq[i]) != "N"):

                        if (str(seq[i]) != "N"):

                            count1 += freq

                            if (conseq[i] != seq[i]):

                                # Locate the codon containing site i from
                                # its frame position (i % 3).
                                codon = []

                                if (i % 3 == 0):
                                    cp = i
                                    cp_a = i + 1
                                    cp_b = i + 2

                                    codon = [cp, cp_a, cp_b]

                                elif (i % 3 == 1):
                                    cp_a = i - 1
                                    cp = i
                                    cp_b = i + 1

                                    codon = [cp_a, cp, cp_b]

                                else:

                                    cp_a = i - 2
                                    cp_b = i - 1
                                    cp = i

                                    codon = [cp_a, cp_b, cp]

                                consensus_aa = conseq[codon[0]:(
                                    codon[2] + 1)].translate()
                                current_aa = seq[codon[0]:(codon[2] +
                                                           1)].translate()

                                # print(str(consensus_aa), str(current_aa))
                                # Skip the rest of this site's sequences if
                                # the consensus codon itself is ambiguous.
                                if 'X' in conseq[codon[0]:(codon[2] + 1)]:
                                    break

                                if (str(consensus_aa) != str(current_aa)):

                                    diff_N += freq
                                else:
                                    diff_S += freq

                                #print i, current_aa, consensus_aa, diff_N, diff_S, each.name, freq
                                diff += freq

                        #print each.name, sorted_timepoints[t], "d", float(diff), i, seq_length, count

            print(count, count1, i, diff, diff_N, diff_S)
            #
            # if((count-count1) != 0):
            #     print(count, count1, i, diff, diff_N, diff_S)

            if count > 0:

                #print i, patient_id, diff, count
                divergence.extend([float(diff) / float(count)])
                divergence_N.extend([float(diff_N) / float(count)])
                divergence_S.extend([float(diff_S) / float(count)])
                divergence_dN.extend(
                    [float(diff_N) / float(nonsyn_sites) / float(count)])
                divergence_dS.extend(
                    [float(diff_S) / float(syn_sites) / float(count)])

        # Summarize this timepoint (needs at least two per-site values) and
        # append a row to the protein-specific CSV.
        if len(divergence) > 1:
            mean_divergence.append(np.mean(divergence))
            median_divergence.append(np.percentile(divergence, 50))
            lower_divergence_25.append(np.percentile(divergence, 25))
            upper_divergence_75.append(np.percentile(divergence, 75))
            lower_divergence_5.append(np.percentile(divergence, 5))
            upper_divergence_95.append(np.percentile(divergence, 95))
            divergence_std.append(np.std(divergence))

            mean_N_divergence.append(np.mean(divergence_N))
            median_N_divergence.append(np.percentile(divergence_N, 50))
            lower_N_divergence_25.append(np.percentile(divergence_N, 25))
            upper_N_divergence_75.append(np.percentile(divergence_N, 75))
            lower_N_divergence_5.append(np.percentile(divergence_N, 5))
            upper_N_divergence_95.append(np.percentile(divergence_N, 95))
            N_divergence_std.append(np.std(divergence_N))

            mean_S_divergence.append(np.mean(divergence_S))
            median_S_divergence.append(np.percentile(divergence_S, 50))
            lower_S_divergence_25.append(np.percentile(divergence_S, 25))
            upper_S_divergence_75.append(np.percentile(divergence_S, 75))
            lower_S_divergence_5.append(np.percentile(divergence_S, 5))
            upper_S_divergence_95.append(np.percentile(divergence_S, 95))
            S_divergence_std.append(np.std(divergence_S))

            dN.append(np.mean(divergence_dN))
            dN_med.append(np.percentile(divergence_dN, 50))
            dN_lower_25.append(np.percentile(divergence_dN, 25))
            dN_upper_75.append(np.percentile(divergence_dN, 75))
            dN_lower_5.append(np.percentile(divergence_dN, 5))
            dN_upper_95.append(np.percentile(divergence_dN, 95))
            dN_std.append(np.std(divergence_dN))

            dS.append(np.mean(divergence_dS))
            dS_med.append(np.percentile(divergence_dS, 50))
            dS_lower_25.append(np.percentile(divergence_dS, 25))
            dS_upper_75.append(np.percentile(divergence_dS, 75))
            dS_lower_5.append(np.percentile(divergence_dS, 5))
            dS_upper_95.append(np.percentile(divergence_dS, 95))
            dS_std.append(np.std(divergence_dS))

            if ("gag" in fastain):
                csvfile_gag_b.write(patient_id + "," +
                                    str(sorted_timepoints[t]) + "," +
                                    str(np.mean(divergence)) + "," +
                                    str(np.percentile(divergence, 50)) + "," +
                                    str(np.percentile(divergence, 5)) + "," +
                                    str(np.percentile(divergence, 95)) + "," +
                                    str(np.mean(divergence_N)) + "," +
                                    str(np.percentile(divergence_N, 50)) +
                                    "," + str(np.percentile(divergence_N, 5)) +
                                    "," +
                                    str(np.percentile(divergence_N, 95)) +
                                    "," + str(np.mean(divergence_S)) + "," +
                                    str(np.percentile(divergence_S, 50)) +
                                    "," + str(np.percentile(divergence_S, 5)) +
                                    "," +
                                    str(np.percentile(divergence_S, 95)) +
                                    "\n")

                csvfile_gag_b.flush()

            elif ("gp41" in fastain):
                csvfile_gp41_b.write(
                    patient_id + "," + str(sorted_timepoints[t]) + "," +
                    str(np.mean(divergence)) + "," +
                    str(np.percentile(divergence, 50)) + "," +
                    str(np.percentile(divergence, 5)) + "," +
                    str(np.percentile(divergence, 95)) + "," +
                    str(np.mean(divergence_N)) + "," +
                    str(np.percentile(divergence_N, 50)) + "," +
                    str(np.percentile(divergence_N, 5)) + "," +
                    str(np.percentile(divergence_N, 95)) + "," +
                    str(np.mean(divergence_S)) + "," +
                    str(np.percentile(divergence_S, 50)) + "," +
                    str(np.percentile(divergence_S, 5)) + "," +
                    str(np.percentile(divergence_S, 95)) + "\n")

        else:
            print "xxx", patient_id, sorted_timepoints[t]

        print patient_id, sorted_timepoints[t], len(divergence)
Example #22
0
def divergence(fastain, translate, date_part, patient_id, sites):
    """Per-timepoint divergence of sequences from the first-timepoint consensus.

    Python 2 only (bare ``print`` statements; ``dict.keys().sort()``).

    Parameters
    ----------
    fastain : str
        Path to an aligned FASTA file; records are grouped into timepoints by
        ``split(fastain, date_part)`` (helper defined elsewhere in this file).
    translate : bool
        If truthy, the whole record sequence is translated before the per-site
        comparison.  NOTE(review): this overwrites the codon-trimmed ``seq``
        computed just above it, so ``seq_length`` no longer matches — confirm
        this branch is ever used with True.
    date_part
        Passed through to ``split()`` to extract the timepoint from names.
    patient_id : str
        Label repeated into the 'Patients' column of the result.
    sites : sequence of two ints
        ``(start, end)`` column slice of the alignment to analyse.

    Returns
    -------
    pandas.DataFrame
        One row per timepoint (rows with NaN dropped) holding mean / median /
        percentile / std of total, nonsynonymous (N) and synonymous (S)
        per-site divergence.  The dN/dS accumulators below are filled but
        never placed in the DataFrame (dead output — see note at the end).
    """

    # Group the input records by sampling timepoint.
    seqs_by_timepoint = split(fastain, date_part)

    # Per-timepoint summary accumulators (one appended value per timepoint).
    mean_divergence = []
    median_divergence = []

    lower_divergence_25 = []
    upper_divergence_75 = []
    lower_divergence_5 = []
    upper_divergence_95 = []
    divergence_std = []
    mean_N_divergence = []
    median_N_divergence = []

    lower_N_divergence_25 = []
    upper_N_divergence_75 = []
    lower_N_divergence_5 = []
    upper_N_divergence_95 = []
    N_divergence_std = []
    mean_S_divergence = []
    median_S_divergence = []
    lower_S_divergence_25 = []
    upper_S_divergence_75 = []
    lower_S_divergence_5 = []
    upper_S_divergence_95 = []
    S_divergence_std = []
    # NOTE(review): the dN*/dS* accumulators are populated below but never
    # written into the returned DataFrame.
    dN = []
    dN_med = []
    dN_lower_25 = []
    dN_upper_75 = []
    dN_lower_5 = []
    dN_upper_95 = []
    dN_std = []
    dS = []
    dS_med = []
    dS_lower_25 = []
    dS_upper_75 = []
    dS_lower_5 = []
    dS_upper_95 = []
    dS_std = []
    patient = []

    # parts = str.split(fastain, "/")
    # parts2 = str.split(parts[len(parts)-1], "_")

    patient.append(patient_id)

    # Expected counts of nonsynonymous / synonymous sites for the alignment
    # (helper defined elsewhere in this file); used only for the dead dN/dS
    # accumulators noted above.
    nonsyn_sites, syn_sites = number_of_N_and_S_sites(fastain, None)
    print nonsyn_sites, syn_sites

    # Python 2: keys() returns a list, sorted in place with a natural-order
    # key (natural_keys defined elsewhere in this file).
    sorted_timepoints = seqs_by_timepoint.keys()
    sorted_timepoints.sort(key=natural_keys)

    print sorted_timepoints
    first_timepoint = AlignIO.MultipleSeqAlignment(
        seqs_by_timepoint[sorted_timepoints[0]])

    # Majority-rule consensus of the earliest timepoint; threshold 0.01 makes
    # dumb_consensus emit a base at effectively every informative column.
    consensus = AlignInfo.SummaryInfo(first_timepoint).dumb_consensus(
        threshold=0.01).upper()

    sampleTimes = []
    for t in sorted_timepoints:
        sampleTimes.append(float(t))

    #for f in filelist:
    for t in range(0, len(sorted_timepoints)):

        # Per-sequence divergence values collected at this timepoint.
        divergence = []
        divergence_N = []
        divergence_S = []
        divergence_dN = []
        divergence_dS = []
        # diff = 0

        seqs_at_t = seqs_by_timepoint[sorted_timepoints[t]]

        for each in seqs_at_t:

            # NOTE(review): parts is parsed but unused; freq is always 1 here
            # (presumably a hook for weighting sequences by read frequency).
            parts = str.split(each.name, "_")
            freq = 1
            diff = 0
            diff_N = 0
            diff_S = 0

            # Gaps are recoded as 'N' so they are skipped by the comparisons.
            seq = Seq(str(each.seq).upper().replace('-',
                                                    'N'))[sites[0]:sites[1]]

            codon_pos_start = 0
            codon_pos_end = 2

            # First and last non-N positions, used to trim leading/trailing
            # runs of N down to whole codons.
            A_i = str(seq).find('A')
            T_i = str(seq).find('T')
            G_i = str(seq).find('G')
            C_i = str(seq).find('C')

            start = [A_i, T_i, G_i, C_i]

            A_ii = str(seq).rfind('A')
            T_ii = str(seq).rfind('T')
            G_ii = str(seq).rfind('G')
            C_ii = str(seq).rfind('C')

            end = [A_ii, T_ii, G_ii, C_ii]

            # NOTE(review): min(start) is -1 if any base is absent entirely,
            # in which case the start_i > -1 guard below skips the sequence.
            start_i = min(start)
            end_i = max(end)

            if start_i > -1 and end_i > -1:
                # print start_i, end_i

                # Snap start forward / end forward to codon boundaries.
                remainder_1 = start_i % 3
                remainder_2 = end_i % 3

                if remainder_1 != 0:
                    b = remainder_1 != codon_pos_start  # NOTE(review): unused
                    # print start_i, start_i + (3-remainder_1)
                    start_i = start_i + (3 - remainder_1)

                if remainder_2 != 2:
                    # tprint end_i, end_i + (3-remainder_2)
                    end_i = end_i + (2 - remainder_2)

                seq = seq[start_i:end_i + 1]
                gaps = str(seq).count('N')

                seq_length = len(seq)
                aa_length = seq_length / 3  # Python 2 integer division

                # Consensus over the same column slice; ambiguous 'X' columns
                # are recoded as 'N' and skipped below.
                conseq = Seq(str(consensus).replace('X',
                                                    'N'))[sites[0]:sites[1]]

                translated_seq = seq.translate()

                gaps_con = str(conseq).count('N')

                if gaps_con == seq_length:

                    # Consensus has no usable columns; abandon this timepoint.
                    print("all gaps in conseq")
                    break

                else:
                    # if (seq_length >= length and (float(gaps) / float(seq_length)) < 0.05 and
                    #             (float(gaps_con) / float(seq_length)) < 0.05):

                    # print translated_seq, conseq.translate(),
                    count = 0
                    if (translate):
                        seq = each.seq.translate()

                    # count +=1

                    # Site-by-site comparison against the consensus; count
                    # compared sites and classify each mismatch as N or S by
                    # translating the enclosing codon.
                    for a in range(seq_length):

                        i = a

                        if (str(conseq[i]) != "N"):

                            if (str(seq[i]) != "N"):

                                count = count + 1

                                if (conseq[i] != seq[i]):

                                    codon = []

                                    # Recover the codon containing column i
                                    # from its reading-frame position.
                                    if (i % 3 == 0):
                                        cp = i
                                        cp_a = i + 1
                                        cp_b = i + 2

                                        codon = [cp, cp_a, cp_b]

                                    elif (i % 3 == 1):
                                        cp_a = i - 1
                                        cp = i
                                        cp_b = i + 1

                                        codon = [cp_a, cp, cp_b]

                                    else:

                                        cp_a = i - 2
                                        cp_b = i - 1
                                        cp = i

                                        codon = [cp_a, cp_b, cp]

                                    consensus_aa = conseq[codon[0]:(
                                        codon[2] + 1)].translate()
                                    current_aa = seq[codon[0]:(codon[2] +
                                                               1)].translate()

                                    #print(str(consensus_aa), str(current_aa))
                                    # Ambiguous consensus codon: abandon the
                                    # rest of this sequence's site loop.
                                    if 'X' in conseq[codon[0]:(codon[2] + 1)]:
                                        break

                                    # Amino acid changed -> nonsynonymous.
                                    if (str(consensus_aa) != str(current_aa)):

                                        diff_N += 1
                                    else:
                                        diff_S += 1

                                    diff += 1

                    # print diff/count, diff/seq_length

                    # Per-site divergence for this sequence, replicated freq
                    # times (freq is currently always 1).
                    divergence.extend([float(diff) / float(count)] * freq)
                    divergence_N.extend([float(diff_N) / float(count)] * freq)
                    divergence_S.extend([float(diff_S) / float(count)] * freq)
                    divergence_dN.extend(
                        [float(diff_N) / float(nonsyn_sites)] * freq)
                    divergence_dS.extend([float(diff_S) / float(syn_sites)] *
                                         freq)

        # Fewer than 100 usable sequences: record NaN summaries (std sentinel
        # 1000) so the row is dropped by dropna() below.
        if len(divergence) < 100:

            mean_divergence.append(float('nan'))
            median_divergence.append(float('nan'))
            lower_divergence_25.append(float('nan'))
            upper_divergence_75.append(float('nan'))
            lower_divergence_5.append(float('nan'))
            upper_divergence_95.append(float('nan'))
            divergence_std.append(float(1000))

            mean_N_divergence.append(float('nan'))
            median_N_divergence.append(float('nan'))
            lower_N_divergence_25.append(float('nan'))
            upper_N_divergence_75.append(float('nan'))
            lower_N_divergence_5.append(float('nan'))
            upper_N_divergence_95.append(float('nan'))
            N_divergence_std.append(float(1000))

            mean_S_divergence.append(float('nan'))
            median_S_divergence.append(float('nan'))
            lower_S_divergence_25.append(float('nan'))
            upper_S_divergence_75.append(float('nan'))
            lower_S_divergence_5.append(float('nan'))
            upper_S_divergence_95.append(float('nan'))
            S_divergence_std.append(float(1000))

            dN.append(float('nan'))
            dN_med.append(float('nan'))
            dN_lower_25.append(float('nan'))
            dN_upper_75.append(float('nan'))
            dN_lower_5.append(float('nan'))
            dN_upper_95.append(float('nan'))
            dN_std.append(float('nan'))

            dS.append(float('nan'))
            dS_med.append(float('nan'))
            dS_lower_25.append(float('nan'))
            dS_upper_75.append(float('nan'))
            dS_lower_5.append(float('nan'))
            dS_upper_95.append(float('nan'))
            dS_std.append(float(1000))

        else:

            #print divergence
            mean_divergence.append(np.mean(divergence))
            median_divergence.append(np.percentile(divergence, 50))
            lower_divergence_25.append(np.percentile(divergence, 25))
            upper_divergence_75.append(np.percentile(divergence, 75))
            lower_divergence_5.append(np.percentile(divergence, 5))
            upper_divergence_95.append(np.percentile(divergence, 95))
            divergence_std.append(np.std(divergence))

            mean_N_divergence.append(np.mean(divergence_N))
            median_N_divergence.append(np.percentile(divergence_N, 50))
            lower_N_divergence_25.append(np.percentile(divergence_N, 25))
            upper_N_divergence_75.append(np.percentile(divergence_N, 75))
            lower_N_divergence_5.append(np.percentile(divergence_N, 5))
            upper_N_divergence_95.append(np.percentile(divergence_N, 95))
            N_divergence_std.append(np.std(divergence_N))

            mean_S_divergence.append(np.mean(divergence_S))
            median_S_divergence.append(np.percentile(divergence_S, 50))
            lower_S_divergence_25.append(np.percentile(divergence_S, 25))
            upper_S_divergence_75.append(np.percentile(divergence_S, 75))
            lower_S_divergence_5.append(np.percentile(divergence_S, 5))
            upper_S_divergence_95.append(np.percentile(divergence_S, 95))
            S_divergence_std.append(np.std(divergence_S))

            dN.append(np.mean(divergence_dN))
            dN_med.append(np.percentile(divergence_dN, 50))
            dN_lower_25.append(np.percentile(divergence_dN, 25))
            dN_upper_75.append(np.percentile(divergence_dN, 75))
            dN_lower_5.append(np.percentile(divergence_dN, 5))
            dN_upper_95.append(np.percentile(divergence_dN, 95))
            dN_std.append(np.std(divergence_dN))

            dS.append(np.mean(divergence_dS))
            dS_med.append(np.percentile(divergence_dS, 50))
            dS_lower_25.append(np.percentile(divergence_dS, 25))
            dS_upper_75.append(np.percentile(divergence_dS, 75))
            dS_lower_5.append(np.percentile(divergence_dS, 5))
            dS_upper_95.append(np.percentile(divergence_dS, 95))
            dS_std.append(np.std(divergence_dS))

    # NOTE(review): DataFrame.from_items is deprecated (removed in pandas
    # 1.0); modern pandas would need DataFrame(dict(...)) or from_dict.
    df = pd.DataFrame.from_items([
        ('Times', sampleTimes), ('Divergence_median', median_divergence),
        ('Divergence_mean', mean_divergence),
        ('Divergence_N_med', median_N_divergence),
        ('Divergence_N_mean', mean_N_divergence),
        ('Divergence_S_med', median_S_divergence),
        ('Divergence_S_mean', mean_S_divergence),
        ('Divergence_lower_25', lower_divergence_25),
        ('Divergence_upper_75', upper_divergence_75),
        ('Divergence_lower_5', lower_divergence_5),
        ('Divergence_upper_95', upper_divergence_95),
        ("Divergence_std", divergence_std),
        ('Divergence_N_lower_25', lower_N_divergence_25),
        ('Divergence_N_upper_75', upper_N_divergence_75),
        ('Divergence_N_lower_5', lower_N_divergence_5),
        ('Divergence_N_upper_95', upper_N_divergence_95),
        ("Divergence_N_std", N_divergence_std),
        ('Divergence_S_lower_25', lower_S_divergence_25),
        ('Divergence_S_upper_75', upper_S_divergence_75),
        ('Divergence_S_lower_5', lower_S_divergence_5),
        ('Divergence_S_upper_95', upper_S_divergence_95),
        ("Divergence_S_std", S_divergence_std),
        ('Patients', patient * len(sampleTimes))
    ])

    # csvfilename = filename.replace("filelist_", "divergence_results_by_birth_patristic_sites_"+str(sites_pos[0])+"_to_"+str(sites_pos[1])+"_")
    # csvfilename = csvfilename.replace(".txt",".csv")
    # df.to_csv(csvfilename)

    # Rows built from the <100-sequence branch are all-NaN and dropped here.
    df = df.dropna()

    # print sampleTimes - (np.ones(len(sampleTimes))*sampleTimes[0])

    # print mean_divergence
    # print divergence_std

    # NOTE(review): this whole branch is dead code -- total_div / N_div /
    # S_div are built but never used or returned.
    if len(df) == 1:

        total_div = [
            'total',
            float('nan'),
            float('nan'),
            float('nan'),
            float('nan'),
            float('nan')
        ]
        N_div = [
            'N',
            float('nan'),
            float('nan'),
            float('nan'),
            float('nan'),
            float('nan')
        ]
        S_div = [
            'S',
            float('nan'),
            float('nan'),
            float('nan'),
            float('nan'),
            float('nan')
        ]

    return df
Example #23
0
if __name__ == '__main__':
    # Build a phylogenetic supermatrix from per-gene alignments and write it
    # (plus an optional partition file) in relaxed Phylip format.  Python 2
    # script; `parts`, `part_names` and the helper functions are module-level
    # names defined elsewhere in this file.
    # DIRS
    input_dir = '2_alignments'
    # NOTE(review): output_dir is never used below -- both output files are
    # written into input_dir.  Looks like a bug; confirm intended directory.
    output_dir = '3_supermatrix'
    # MATCH IDS INTO SINGLE DICTIONARY
    print part_names
    seqdict, part_names = getSeqDict(parts, part_names)
    # CONSTRUCT SUPERMATRIX
    supermatrix, ngaps = getSupermatrix(seqdict, parts, part_names)
    print len(supermatrix)
    print ngaps
    # Mean gaps per species (float division is explicit for Python 2).
    ngaps_psp = float(ngaps) / len(supermatrix)
    # GET PARTITIONS
    partition_text = getPartitions(parts, part_names)
    # OUTPUT
    alignment = AlignIO.MultipleSeqAlignment(supermatrix)
    print(
        'Supermatix of [{0}] length and [{1}] species generated with [{2}] \
    gaps per species'.format(alignment.get_alignment_length(), len(alignment),
                             ngaps_psp))
    outfile = os.path.join(input_dir, 'supermatrix.phy')
    with open(outfile, "w") as f:
        # write out using PhylipWriter in order to extend id_width
        # (plain AlignIO.write truncates ids to the strict 10-char limit)
        AlignIO.PhylipIO.PhylipWriter(f).write_alignment(alignment,
                                                         id_width=100)
    # OUTPUT PARITIONS
    if partition_text:
        outfile = os.path.join(input_dir, 'partitions.txt')
        with open(outfile, 'w') as file:
            file.write(partition_text)
Example #24
0
def plot_pairwise_diff(fastain, window_size):
    """Per-window expected amino-acid pairwise difference along an alignment.

    Python 2 only: `seq_length / window_size` is integer division feeding
    `range()`, and bare `print` statements are used in the loop.

    Parameters
    ----------
    fastain : str
        Path to a nucleotide FASTA alignment; gaps are recoded as 'N' before
        translation.
    window_size : int
        Nucleotide window width used to derive the number of windows.

    Returns
    -------
    dict
        {"codon_no": [window indices], "pwd": [per-window diversity]} where
        pwd is sum over AA pairs of freq_i * freq_j.
    """

    # The 20 standard amino acids whose frequencies are tallied per window.
    AA = [
        'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F',
        'P', 'S', 'T', 'W', 'Y', 'V'
    ]
    print(len(AA))
    aln = AlignIO.read('%s' % fastain, 'fasta')

    trans_aln = []

    # Translate every record (gaps -> 'N' so translation does not fail).
    for i in range(len(aln)):

        trans_aln.append(
            SeqIO.SeqRecord(
                Seq(str(aln[i].seq).replace('-', 'N')).translate()))

    trans_aln = AlignIO.MultipleSeqAlignment(trans_aln)

    # Nucleotide alignment length taken from the second record's row.
    seq_length = len(aln[1, :])

    n_windows = seq_length / window_size  # Python 2 integer division

    # NOTE(review): midpoint, raw_diff, count, the pre-loop end, and
    # sliding_window_size are never used below.
    midpoint = []
    pwd = []
    raw_diff = []
    window_no = []

    count = 1
    end = window_size
    sliding_window_size = 100

    for each in range(n_windows):

        start = each * window_size
        end = (each + 1) * window_size

        print start, end, each + 1
        # NOTE(review): start/end are computed above but the slice takes the
        # single alignment column `each` -- this looks like it should be
        # trans_aln[:, start:end]; confirm intended window.
        sub_aln = trans_aln[:, each]

        aa_freq = []

        # Count each amino acid's occurrences in the window.
        for a in AA:

            align_array = np.array(sub_aln, np.str)
            aa_freq.append(str(align_array).count(a))

        total_aa_freq = sum(aa_freq)

        # Expected pairwise difference: sum over distinct AA pairs of
        # freq_i * freq_j (freq_j uses total-1, i.e. without replacement).
        diff = 0.0
        for i in range(0, len(aa_freq)):

            freq_i = float(aa_freq[i]) / float(total_aa_freq)
            print(AA[i], freq_i)
            for j in range((i + 1), len(aa_freq)):

                freq_j = float(aa_freq[j]) / float(total_aa_freq - 1)

                diff += freq_i * freq_j

        #print diff

        pwd.append(diff)
        window_no.append(each + 1)

    return {"codon_no": window_no, "pwd": pwd}
Example #25
0
def cut_fasta(input_file):
    '''
    Input - name of file with alignment in fasta-format
    Finds positions without gaps in reference sequence (the first in alignment)
    Removes the columns with gaps in reference sequence from alignment
    Drops every sequence that is 10% gaps or more after cutting
    Saves new alignment to the file 'input_file_name_cut.fasta'
    '''

    # Read the alignment; close the input handle deterministically.
    with open(input_file) as in_fobj:
        alignment = AlignIO.read(in_fobj, "fasta")  # alignment object
    temp_seq = alignment[0].seq  # template - reference sequence

    positions = []  # list of [start, end) blocks without gaps in reference

    pos_st = 0  # start position of block without gaps in reference seq
    pos_end = 0  # end position of block without gaps in reference seq

    count = 0  # current column index
    prev = ''  # previous nucleotide

    # if k==1, the scan is currently inside a block of gaps
    k = 0

    # Scan the reference sequence, recording maximal gap-free blocks.
    for nuc in temp_seq:
        if nuc == '-' and prev != '-':
            pos_end = count
            if pos_end != 0:
                positions.append([pos_st, pos_end])
                k = 1
        if nuc != '-' and prev == '-':
            pos_st = count
            k = 0
        count += 1
        prev = nuc
    if k == 0:
        # Sequence ended inside a gap-free block; close it out.
        positions.append([pos_st, len(temp_seq)])
    print(positions)

    # if no gaps in reference seq
    if len(positions) == 0:
        alignment1 = alignment
    # cutting regions without gaps
    else:
        # Stitch the gap-free column blocks back together.
        alignment1 = alignment[:, positions[0][0]:(positions[0][1])]
        for i in range(1, len(positions)):
            alignment1 = alignment1 + alignment[:, positions[i][0]:
                                                (positions[i][1])]

    # If more than 10% of sequence are gaps, the sequence is deleted.
    # BUGFIX: use float division -- under Python 2 the original
    # `count_gap / len(rec.seq)` was integer division, always 0, so the
    # filter never removed any sequence.
    alignment_l = []
    for rec in alignment1:
        count_gap = rec.seq.count('-')

        if float(count_gap) / len(rec.seq) < 0.10:
            alignment_l.append(rec)

    alignment_new = AlignIO.MultipleSeqAlignment(alignment_l)

    print('Number of sequences in alignment {}'.format(len(alignment_new)))
    # print( "Alignment length {0}".format(alignment_new.get_alignment_length()))

    out_file = str(os.path.splitext(input_file)[0] + '_cut.fasta')
    # out_file = '.'.join(input_file.split('.')[:-1]) + '_cut.fasta'
    # Write and close the output handle deterministically.
    with open(out_file, 'w') as out_fobj:
        AlignIO.write(alignment_new, out_fobj, "fasta")
Example #26
0
# Iterate through all seqs in the alignment, splitting contigs where necessary.
# Script-level code: AlignedSeqs, ContigsFound, args, FirstRefStart,
# LastRefEnd, AlignmentLength, NumRefSeqs, split_parts and
# RemoveBlankColumns are all defined earlier in this file (not shown here).
SeqsForOutput = []
for seq in AlignedSeqs:
    if seq.id in ContigsFound:
        if args.trim_overhangs:
            # Replace everything outside [FirstRefStart, LastRefEnd] with
            # gaps, padding so the trimmed sequence keeps the full alignment
            # length.
            TrimmedSeq = "-" * FirstRefStart + str(seq.seq)[FirstRefStart:LastRefEnd \
            + 1] + "-" * (AlignmentLength - LastRefEnd - 1)
            assert len(TrimmedSeq) == AlignmentLength, \
            "Internal malfunction of overhang trimming"
            seq.seq = Seq(TrimmedSeq)
        # Split the contig at large gaps; drop pieces below the size floor.
        SeqsForOutput += list(
            split_parts(seq, args.min_contig_size, args.split_gap_size))
    else:
        SeqsForOutput.append(seq)

# It's possible that after splitting contigs and imposing a minimum length
# threshold, there are no contigs left. Exit with status 3 - shiver's reserved
# non-zero exit status to indicate a lack of HIV data.
NumContigs = len(SeqsForOutput) - NumRefSeqs
if NumContigs == 0:
    print("After splitting contigs at gaps of length at least",
    args.split_gap_size, "and discarding contigs of length less than " + \
    str(args.min_contig_size) + ", no contigs were left. Quitting.",
    file=sys.stderr)
    exit(3)

# Remove pure-gap columns and print the output.
OutputAlignment = AlignIO.MultipleSeqAlignment(SeqsForOutput)
OutputAlignment = RemoveBlankColumns(OutputAlignment)
AlignIO.write(OutputAlignment, sys.stdout, 'fasta')
Example #27
0
                    i += 1
            break

    positions = list()
    if os.path.exists(args.pos):
        with open(args.pos, 'r') as pos_file:
            positions = [
                return_range(pos)
                for pos in pos_file.readline().strip().split(',')
            ]
    else:
        positions = [return_range(pos) for pos in args.pos.split(',')]

    # flatten list
    positions = [pos for element in positions for pos in element]

    # prepare alignment positions
    aln_pos = [ref_pos.get(rpos, None) for rpos in positions]

    # extracted records
    e_records = []

    for record in aln:
        seq = ''.join([record.seq[apos] for apos in aln_pos])
        e_records.append(SeqRecord(Seq(seq), id=record.id, description=''))

    aln_extract = AlignIO.MultipleSeqAlignment(e_records)

    with open(args.out, 'w') as out_file:
        AlignIO.write(aln_extract, out_file, args.out_fmt)
Example #28
0
 def test_ambiguous_bases_one_seq(self):
     """R (A/G) and W (A/T) at the first two positions expand to 4 sequences."""
     aln = AlignIO.MultipleSeqAlignment([SeqRecord(Seq("RWAAT"))])
     expanded = get_expanded_sequences(aln)
     self.assertEqual(
         set(expanded),
         {"GAAAT", "AAAAT", "GTAAT", "ATAAT"},
     )