コード例 #1
0
def records_from_data_files(filepaths=None, folder=None):
    """Automatically convert files or a folder's content to Biopython records.

    Parameters
    ----------
    filepaths
      A list of paths to sequence files (Genbank, Fasta, zip archives...).

    folder
      A path to a folder of sequence files; when given, it overrides
      ``filepaths``.
    """
    # Placeholder IDs/names that various exporters write when a sequence has
    # no real name. Built once (a set) instead of once per record.
    unknown_ids = {
        "None",
        "",
        "<unknown id>",
        ".",
        "EXPORTED",
        "<unknown name>",
        "Exported",
    }
    if folder is not None:
        filepaths = [f._path for f in flametree.file_tree(folder)._all_files]
    records = []
    for filepath in filepaths:
        filename = os.path.basename(filepath)
        if filename.lower().endswith("zip"):
            records += records_from_zip_file(filepath)
            continue
        recs, fmt = records_from_file(filepath)
        single_record = len(recs) == 1
        for i, record in enumerate(recs):
            name_no_extension = "".join(filename.split(".")[:-1])
            # Suffix a 4-digit index when one file holds several records.
            name = name_no_extension + ("" if single_record else ("%04d" % i))
            name = name.replace(" ", "_")
            if has_dna_alphabet:  # Biopython <1.78
                record.seq.alphabet = DNAAlphabet()
            record.annotations["molecule_type"] = "DNA"
            # Replace placeholder IDs/names with one derived from the file
            # name, so downstream reports stay readable.
            if str(record.id).strip() in unknown_ids:
                record.id = name
            if str(record.name).strip() in unknown_ids:
                record.name = name
            record.file_name = name_no_extension
        records += recs
    return records
コード例 #2
0
    def to_record(self, record=None, record_id=None):
        """Return a Biopython seqrecord of the quote.

        >>> record = to_record(solution)
        >>> # Let's plot with DnaVu:
        >>> from dnavu import create_record_plot
        >>> from bokeh.io import output_file, show
        >>> output_file("view.html")
        >>> plot = create_record_plot(record)
        >>> show(plot)
        """
        if record_id is None:
            record_id = self.id
        if record is not None:
            # Never mutate the caller's record.
            record = deepcopy(record)
        else:
            if has_dna_alphabet:  # Biopython <1.78
                quote_seq = Seq(self.sequence, DNAAlphabet())
            else:
                quote_seq = Seq(self.sequence)
            record = SeqRecord(quote_seq, id=record_id)
            record.annotations["molecule_type"] = "DNA"

        if self.plan is not None:
            # One misc_feature per quote segment, prepended to any features
            # already present on the record.
            plan_features = []
            for quote in self.plan:
                segment = FeatureLocation(
                    quote.segment_start, quote.segment_end, 1
                )
                plan_features.append(
                    SeqFeature(
                        segment,
                        type="misc_feature",
                        qualifiers={
                            "label": "%s - From %s" % (quote.id, quote.source),
                            "name": quote.id,
                            "source": quote.source,
                            "price": quote.price,
                            "lead_time": quote.lead_time,
                        },
                    )
                )
            record.features = plan_features + record.features
        return record
コード例 #3
0
def write_record(
    record,
    target,
    file_format="genbank",
    remove_locationless_features=True,
    max_name_length=20,
):
    """Write a record as genbank, fasta, etc. via Biopython, with fixes.

    Parameters
    ----------
    record
      A biopython record

    target
      Path to a file or filelike object.

    file_format
      Format, either Genbank or fasta

    remove_locationless_features
      If True, will remove all features whose location is None, to avoid a
      Biopython bug

    max_name_length
      The record's name will be truncated if longer than this (also here to
      avoid a biopython bug).

    """
    # Work on a copy so the caller's record is left untouched.
    record = deepcopy(record)
    if remove_locationless_features:
        record.features = [
            f for f in record.features if f.location is not None
        ]
    record.name = record.name[:max_name_length]
    # Biopython >=1.78 removed Seq.alphabet; only touch it when the attribute
    # exists (Biopython <1.78) — the unguarded access used to raise there.
    if hasattr(record.seq, "alphabet"):
        if str(record.seq.alphabet.__class__.__name__) != "DNAAlphabet":
            record.seq.alphabet = DNAAlphabet()
    if hasattr(target, "open"):
        target = target.open("w")
    SeqIO.write(record, target, file_format)
コード例 #4
0
ファイル: tools.py プロジェクト: jeqka24/CUBA
def records_from_data_files(data_files, use_file_names_as_ids=False):
    """Convert uploaded data files into Biopython records.

    Parameters
    ----------
    data_files
      A list of file objects (each with a ``.name``, optionally a
      ``.circular`` flag and a ``.source_file``).

    use_file_names_as_ids
      If True, for every file containing a single record, the file name
      (without extension) is used as the record's ID.
    """
    # Placeholder IDs/names that various exporters write when a sequence has
    # no real name. Built once (a set) instead of once per record.
    unknown_ids = {
        "None",
        "",
        "<unknown id>",
        ".",
        "EXPORTED",
        "<unknown name>",
        "Exported",
    }
    records = []
    for file_ in data_files:
        # Default to circular when the file object carries no "circular" flag.
        circular = ("circular" not in file_) or file_.circular
        if file_.name.lower().endswith("zip"):
            records += records_from_zip_file(
                file_, use_file_names_as_ids=use_file_names_as_ids)
            continue
        recs, fmt = records_from_data_file(file_)
        single_record = len(recs) == 1
        for i, record in enumerate(recs):
            record.circular = circular
            record.linear = not circular
            name_no_extension = "".join(file_.name.split(".")[:-1])
            # Suffix a 4-digit index when one file holds several records.
            name = name_no_extension + ("" if single_record else ("%04d" % i))
            name = name.replace(" ", "_")
            record.seq.alphabet = DNAAlphabet()
            # Replace placeholder IDs/names with one derived from the file
            # name, so downstream reports stay readable.
            if str(record.id).strip() in unknown_ids:
                record.id = name
            if str(record.name).strip() in unknown_ids:
                record.name = name
            record.file_name = name_no_extension
            if use_file_names_as_ids and single_record:
                basename = os.path.basename(record.source_file)
                basename_no_extension = os.path.splitext(basename)[0]
                record.id = basename_no_extension
        records += recs
    return records
コード例 #5
0
    def assemble_with(self,
                      other,
                      annotate_homology=False,
                      annotation_type="homology"):
        """Assemble this fragment with ``other`` into a new StickyEndFragment."""
        # The shared overhang becomes a connector record between the two parts.
        homology = SeqRecord(Seq(str(self.seq.right_end)))
        if annotate_homology:
            self.annotate_connector(homology, annotation_type=annotation_type)
        self_record = SeqRecord(
            seq=Seq(str(self.seq)),
            features=self.features,
            annotations=self.annotations,
        )
        assembled = SeqRecord.__add__(self_record, homology).__add__(other)
        # Recompute the sequence so sticky-end arithmetic applies.
        assembled.seq = self.seq + other.seq
        assembled.__class__ = StickyEndFragment

        if has_dna_alphabet:  # Biopython <1.78
            assembled.seq.alphabet = DNAAlphabet()
        assembled.annotations["molecule_type"] = "DNA"

        return assembled
コード例 #6
0
 def assemble_with(self,
                   other,
                   annotate_homology=False,
                   annotation_type="misc_feature",
                   **qualifiers):
     """Assemble this fragment with ``other`` via their shared sticky end."""
     connector = SeqRecord(Seq(str(self.seq.right_end)))
     if not qualifiers:
         qualifiers = {"label": "homology"}
     if annotate_homology:
         # Annotate the whole connector span on the forward strand.
         span = FeatureLocation(0, len(connector), 1)
         connector.features = [
             SeqFeature(span,
                        type=annotation_type,
                        qualifiers=qualifiers)
         ]
     base = SeqRecord(seq=Seq(str(self.seq)),
                      features=self.features,
                      annotations=self.annotations)
     assembled = SeqRecord.__add__(base, connector).__add__(other)
     # Recompute the sequence so sticky-end arithmetic applies.
     assembled.seq = self.seq + other.seq
     assembled.__class__ = StickyEndsSeqRecord
     assembled.seq.alphabet = DNAAlphabet()
     return assembled
コード例 #7
0
    def get_list_of_seqrecords_from_collection(self) -> list:
        """Return all documents of the collection as a list of SeqRecords.

        Documents whose sequence is None are skipped. Returns None when the
        collection yields no usable records, or when fetching fails (the
        error is printed and swallowed, matching the previous behavior).
        """
        collection = self.get_collection()

        try:
            collection_cursor = collection.find({})

            seq_records = []
            for document in collection_cursor:
                gene = GeneDTO()
                gene.gene_id = document[Constants.GENE_ID]
                gene.sequence = document[Constants.SEQUENCE]

                # Skip documents with no stored sequence.
                if gene.sequence is None:
                    continue

                seq_records.append(
                    SeqRecord(Seq(gene.sequence, DNAAlphabet()),
                              id=gene.gene_id))

            # An empty result is reported as None, not as an empty list.
            if len(seq_records) == 0:
                return None
            return seq_records

        except Exception as error:
            print(
                'Caught exception getting all elements of collection as SeqRecords list: '
                + repr(error))
            return None
コード例 #8
0
def sequence_to_biopython_record(sequence,
                                 id="<unknown id>",
                                 name="same_as_id",
                                 features=()):
    """Return a SeqRecord of the sequence, ready to be Genbanked."""
    # Resolve the record name once; "same_as_id" mirrors the id.
    record_name = id if name == "same_as_id" else name
    if has_dna_alphabet:  # Biopython <1.78
        seq = Seq(sequence, alphabet=DNAAlphabet())
    else:
        seq = Seq(sequence)
    seqrecord = SeqRecord(
        seq,
        id=id,
        name=record_name,
        features=list(features),
    )
    seqrecord.annotations["molecule_type"] = "DNA"
    return seqrecord
コード例 #9
0
                for distance_formula in ['dstar', 'ARS2015']:
                    kmer_distance_matrices[distance_formula] = dict()
                    finite_counts_matrices[distance_formula] = dict()
                    for k_i in k:
                        kmer_distance_matrices[distance_formula][
                            k_i] = zero_distance_matrix(species_names)
                        finite_counts_matrices[distance_formula][
                            k_i] = zero_distance_matrix(species_names)
                # Prepare concatenated sequence object
                sample_ids = [
                    sample.name for sample in itertools.chain.from_iterable(
                        base_embedding.values())
                ]
                concatenated_sequences = dict(
                    (sample_id,
                     SeqRecord(Seq('', DNAAlphabet()),
                               id=sample_id,
                               name=sample_id,
                               description=sample_id))
                    for sample_id in sample_ids)

                for gene in range(nr_genes):
                    # generate gene tree and sequences
                    # for each set of sequence, and for each distance formula and value of k, generate a k-mer distance matrix. sum these matrices for all genes
                    # also store the concatenated sequences
                    coalescent = EmbeddedGeneForest(tree, base_embedding)
                    coalescent.coalesce(theta)
                    genetree = coalescent.genetree()
                    with TemporaryDirectory() as tmpdir:
                        sequences = mutate_indelible(genetree,
                                                     m,
コード例 #10
0
def clean_seqs(fasta_in,fasta_out=None,filter_include_expression=None,filter_exclude_expression=None,bp_ranges=None,start_date=None,end_date=None,ungap=None):
    """Filter, trim and normalize the records of a FASTA file.

    Records are kept if they match the include regex (when given), do not
    match the exclude regex, and their embedded ISO date (YYYY-MM-DD, parsed
    from the id/description) falls within [start_date, end_date]. Surviving
    sequences are upper-cased, optionally restricted to 1-based ``bp_ranges``
    and ungapped, then written to ``fasta_out`` (or an auto-named
    "<input>_cleaned<filters>.fasta" next to the input). Raises IOError when
    the output file already exists.
    """
    # Matches an ISO date (YYYY-MM-DD) embedded anywhere in a header field.
    iso_date_re = re.compile(r'(\d{4}-\d{2}-\d{2})')

    bp_ranges = bp_ranges or []

    # Build a human-readable suffix describing the applied filters; it is
    # appended to the default output file name.
    bp_range_str          = "_".join([str(t[0])+"-"+str(t[1])+"bp" for t in bp_ranges])
    start_date_str        = "" if not start_date else "starting_"+start_date.strftime("%Y-%m-%d")
    end_date_str          = "" if not end_date else "ending_"+end_date.strftime("%Y-%m-%d")
    filter_include_str    = "" if not filter_include_expression else "only_subset_by_filter"
    filter_exclude_str    = "" if not filter_exclude_expression else "excluding_some_by_filter"
    output_summary_string = "_".join(s for s in [bp_range_str,start_date_str,end_date_str,filter_include_str,filter_exclude_str] if len(s)>0)

    if len(output_summary_string)>0:
        output_summary_string="_"+output_summary_string

    in_fasta_basename = os.path.splitext(os.path.basename(fasta_in.name))[0]
    out_basedir       = os.path.realpath(os.path.dirname(fasta_in.name))

    out_filepath = fasta_out or os.path.join(out_basedir,in_fasta_basename+"_cleaned"+output_summary_string+".fasta")

    if os.path.exists(out_filepath):
        raise IOError("%s already exists; skipping..." % out_filepath)

    # Compile the filter regexes once, outside the per-record loop.
    if filter_include_expression:
        filter_include_re = re.compile(filter_include_expression)
    if filter_exclude_expression:
        filter_exclude_re = re.compile(filter_exclude_expression)

    with open(out_filepath, "w") as handle:
        # NOTE(review): the `fasta_out` parameter is shadowed here by the
        # writer object; its original value was already consumed above.
        fasta_out = FastaIO.FastaWriter(handle, wrap=80) # wrap=None
        fasta_out.write_header()
        for record in SeqIO.parse(fasta_in.name, "fasta"):
            should_output=True
            # Include filter: the record must match on id OR description.
            if filter_include_expression:
                should_output=False
                if filter_include_re.search(record.id) or filter_include_re.search(record.description):
                    should_output=True

            # Exclude filter overrides the include filter.
            if filter_exclude_expression and (filter_exclude_re.search(record.id) or filter_exclude_re.search(record.description)):
                should_output=False

            # Drop records whose embedded date precedes start_date.
            if start_date:
                for field in [record.description,record.id]:
                    match = iso_date_re.search(field)
                    if match:
                        seq_date = datetime.datetime.strptime(match.group(0), "%Y-%m-%d")
                        if seq_date<start_date:
                            should_output=False

            # Drop records whose embedded date follows end_date.
            if end_date:
                for field in [record.description,record.id]:
                    match = iso_date_re.search(field)
                    if match:
                        seq_date = datetime.datetime.strptime(match.group(0), "%Y-%m-%d")
                        if seq_date>end_date:
                            should_output=False

            if should_output:

                if len(bp_ranges)==0:
                    record.seq=MutableSeq(str(record.seq).upper(), DNAAlphabet())
                else:
                    # Concatenate the requested (1-based, inclusive) ranges.
                    output_seq=MutableSeq("", DNAAlphabet())
                    for start,end in bp_ranges:
                        start-=1 # remove one since biopython seqs are zero-indexed
                        # end-=1 # remove one since biopython seqs are zero-indexed; not needed because slice upper is exclusive
                        start=max(start,0) # bound to limit of sequence
                        end=min(end,len(record)) # bound to limit of sequence
                        output_seq+=record.seq[start:end]
                    record.seq=Seq(str(output_seq).upper(),DNAAlphabet())

                if ungap!=None:
                    record.seq=Seq(str(record.seq).upper(),DNAAlphabet()).ungap(ungap)

                #record.id=copy.deepcopy(record.id).replace(" ",CHARACTER_TO_USE)
                #record.description=copy.deepcopy(record.description).replace(" ",CHARACTER_TO_USE)
                # set the id to the description, which is the ID in the case of GISAID
                # and remove the description.
                record.id=copy.deepcopy(record.description).replace(" ",CHARACTER_TO_USE)
                record.description=""
                fasta_out.write_record(record)
コード例 #11
0
# NOTE(review): this fragment relies on `args`, `countsFile` and `outFile`
# being defined earlier in the script (not visible here). The use of
# `iteritems` and `Seq.Seq(..., DNAAlphabet())` implies Python 2 with
# Biopython <1.78.
seqFile = open(args.seq, 'r')
seqDict = SeqIO.to_dict(SeqIO.parse(seqFile, "fasta"))

# Upper-case every record so the k-mer searches below are case-insensitive.
for key, seq in seqDict.iteritems():
    seqDict[key] = seq.upper()

# For each counts line (sequence name, k-mer, ...), append a state column:
#   0 - k-mer found at least args.flank bases away from both sequence ends
#   1 - k-mer found within args.flank bases of an end
#   2 - sequence unknown, or neither the k-mer nor its reverse complement found
for line in countsFile:
    vals = line.split()
    seqName = vals[0]
    state = 0
    if (seqName not in seqDict):
        state = 2
    else:
        kmer = vals[1]
        kmerrc = Seq.Seq(kmer, DNAAlphabet()).reverse_complement()
        kmerPos = seqDict[seqName].seq.find(kmer)
        kmerrcPos = seqDict[seqName].seq.find(kmerrc)
        # Prefer the reverse-complement hit when one exists.
        pos = kmerPos
        if (kmerrcPos >= 0):
            pos = kmerrcPos
        if (pos >= 0):
            # Distance to the nearest sequence end.
            distToFlank = min(pos, len(seqDict[seqName].seq) - pos)
            #            print str(pos) + " " + str(distToFlank) + " " + str(len(seqDict[seqName].seq)) + " " + seqName
            if (distToFlank < args.flank):
                state = 1
            else:
                state = 0
        else:
            state = 2
    outFile.write(line.strip() + "\t" + str(state) + "\n")
コード例 #12
0
def load_records_from_files(files=None,
                            folder=None,
                            use_file_names_as_ids=False):
    """Automatically convert files or a folder's content to biopython records.

    Parameters
    ----------

    files
      A list of path to files. A ``folder`` can be provided instead.

    folder
      A path to a folder containing sequence files.

    use_file_names_as_ids
      If True, for every file containing a single record, the file name
      (without extension) will be set as the record's ID.
    """
    if files is not None:
        for file in files:
            if isinstance(file, str) and not os.path.exists(file):
                raise IOError("File %s not found" % file)

    if folder is not None:
        files = [f._path for f in flametree.file_tree(folder)._all_files]
    # Placeholder IDs/names that various exporters write when a sequence has
    # no real name. Built once (a set) instead of once per record.
    unknown_ids = {
        "None",
        "",
        "<unknown id>",
        ".",
        "EXPORTED",
        "<unknown name>",
        "Exported",
    }
    records = []
    for filepath in files:
        filename = os.path.basename(filepath)
        if filename.lower().endswith("zip"):
            records += _load_records_from_zip_file(
                filepath, use_file_names_as_ids=use_file_names_as_ids)
            continue
        recs, fmt = load_records_from_file(filepath)
        single_record = len(recs) == 1
        for i, record in enumerate(recs):
            name_no_extension = "".join(filename.split(".")[:-1])
            # Suffix a 4-digit index when one file holds several records.
            name = name_no_extension + ("" if single_record else ("%04d" % i))
            name = name.replace(" ", "_")

            if has_dna_alphabet:  # Biopython <1.78
                record.seq.alphabet = DNAAlphabet()
            record.annotations["molecule_type"] = "DNA"

            # Replace placeholder IDs/names with one derived from the file
            # name. Bug fix: the second check previously re-tested record.id;
            # it must test and set record.name (cf. records_from_data_files).
            if str(record.id).strip() in unknown_ids:
                record.id = name
            if str(record.name).strip() in unknown_ids:
                record.name = name
            record.file_name = name_no_extension
            if use_file_names_as_ids and single_record:
                basename = os.path.basename(record.source_file)
                basename_no_extension = os.path.splitext(basename)[0]
                record.id = basename_no_extension
        records += recs
    return records
コード例 #13
0
#!/usr/bin/python

import logging
import sys

from Bio.Seq import MutableSeq
from Bio.Seq import Seq
from Bio.Alphabet import DNAAlphabet
from optparse import OptionParser
import sequence_utils

# Module-level sequence constants, built once with the legacy DNA alphabet
# (Bio.Alphabet / Seq alphabets only exist in Biopython <1.78).
ALPHABET = DNAAlphabet()
# Default number of sequence bases — presumably the default primer length;
# TODO confirm against the rest of this script.
DEFAULT_SEQ_BASES = 20
# ATG start codon followed by six histidine codons (CAT/CAC).
START_HIS = Seq('ATGCATCATCACCATCACCAC', ALPHABET)
# Presumably the inverse CAP linker anchor sequence — verify with callers.
INV_CAP_LINKER = Seq('GCTAGCGTTGATCGGGCACGTAAGAG', ALPHABET)


def MakeOpts():
    """Returns an OptionParser object with all the default options."""
    opt_parser = OptionParser()
    opt_parser.add_option(
        "-i",
        "--input_filename",
        dest="input_filename",
        help="The filename of the sequence to make primers for.")
    opt_parser.add_option(
        "-o",
        "--overlap_length",
        type="int",
        dest="overlap_length",
        default=20,
コード例 #14
0
def translate_chain_force_in_frame(chain):
    """Force the chain's nucleotides into frame, then translate to protein."""
    in_frame_nt = sequence_force_in_frame(chain, replace=False)
    # Gaps become 'N' so translation yields 'X' instead of failing.
    coding = Seq(in_frame_nt.replace('-', 'N'), DNAAlphabet())
    return coding.translate().tostring()
コード例 #15
0
# with open('d:\\x.txt', 'w') as fp:
#     fp.write(inp)
# Parse the project JSON (read into `inp` earlier in this script — not
# visible here) and concatenate each part's sequence, recording one
# SeqFeature per part.
project = json.loads(inp)
blocks = project['parts']
seq = ''
features = []
start = 0
for block in blocks:
    seq += block['sequence']
    end = start + len(block['sequence'])
    features.append(
        SeqFeature(FeatureLocation(start, end, strand=block['strand']),
                   type=block['featureType'],
                   id=block['name']))
    start = end  # the next part begins where this one ended

# Build a Genbank-serializable record from the assembled sequence.
sequence = Seq(seq, DNAAlphabet())
# features = [SeqFeature(FeatureLocation(1, 3, strand=1), type="CDS"), SeqFeature(FeatureLocation(5, 7, strand=-1), type="intron", id="someid",qualifiers={"quqqli":"bar"})]
seqRecord = SeqRecord(sequence, features=features)
# print(seqRecord)

# Serialize to Genbank in memory and emit it as JSON on stdout.
string_io = io.StringIO()
SeqIO.write(seqRecord, string_io, 'genbank')
print(json.dumps({"content": string_io.getvalue()}))

# # with open('temp.gb', 'w') as fp:
# fileName = '%s.gb'%randomString()
# SeqIO.write(seqRecord, '../public/genbank/%s'%fileName, 'genbank')
# print(json.dumps({"fileURL": 'genbank/%s'%fileName}))
# print(json.dumps({"gb":fp.getvalue()}))
コード例 #16
0
def main(argv):
    """Simulate sequence data over a Huelsenbeck (a, b) grid and benchmark
    several k-mer-based tree-reconstruction methods, persisting every
    simulation and tree estimate to a SQLite database via SQLAlchemy.

    argv: command-line arguments (see the getopt spec below).
    """
    # ---- default parameters (several can be overridden on the CLI) ----
    indelible_model = 'JC'
    # NOTE(review): this immediately overrides the 'JC' default above.
    indelible_model = 'LAV0.01a'
    theta = 0.01
    mu = 1
    #k = (1,2)
    k = (1,2,3,4,5)
    m = 100
    n = 5
    nr_genes = 10
    nr_sims = 1
    nr_rows = 3
    nr_cols = 3
    a_max = 0.74
    b_max = 0.74
    #a_max = 0.3
    #b_max = 0.2
    kmer_methods = ['CoalescentJCNJ', 'CoalescentJCLS', 'JCNJ','dstarNJ','concatdJCNJ']
    #kmer_methods = ['dstarNJ','concatdJCNJ']
    distance_formulas = ['ARS2015', 'alignment_based']
    multiple_alignment_method = 'clustalo'
    alignment_method = 'stretcher'
    N = theta/mu
    db_file = 'db.sql'

    # ---- command-line option parsing ----
    try:
        opts, args = getopt.getopt(argv,"hk:m:n:o:",["indelible_model=","theta=","genes=","sims=","rows=","cols=","a_max=","b_max="])
    except getopt.GetoptError as err:
        # print usage information and exit:
        print(str(err))
        usage()
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            usage()
            sys.exit()
        elif opt == "--theta":
            theta = float(arg)
            N = theta
        elif opt == "-k":
            # e.g. "-k (1,2,3)" -> (1, 2, 3)
            k = tuple([int(k_i) for k_i in re.sub("[()]","",arg).split(",")])
        elif opt == "-m":
            m = int(arg)
        elif opt == "-o":
            db_file = arg
        elif opt == "-n":
            n = int(arg)
        elif opt == "--genes":
            nr_genes = int(arg)
        elif opt == "--method":
            # NOTE(review): "--method", "--distance_formula" and
            # "--reconstruct_only" are not declared in the getopt long-option
            # list above, so these branches appear unreachable — confirm.
            method = arg 
        elif opt == "--distance_formula":
            distance_formula = arg
        elif opt == "--sims":
            nr_sims = int(arg)
        elif opt == "--rows":
            nr_rows = int(arg)
        elif opt == "--cols":
            nr_cols = int(arg)
        elif opt == "--a_max":
            a_max = float(arg)
        elif opt == "--b_max":
            b_max = float(arg)
        elif opt == "--indelible_model":
            indelible_model = arg
        elif opt == "--reconstruct_only":
            reconstruct_only = True
    usage()

    # ---- logging and database session setup ----
    import logging
    sqla_logger = logging.getLogger('sqlalchemy.engine.base.Engine')
    sqla_logger.propagate = False
    sqla_logger.addHandler(logging.FileHandler('/tmp/sqla.log'))

    from sqlalchemy import create_engine
    engine = create_engine('sqlite:///{:s}'.format(os.path.abspath(db_file)), echo=True, convert_unicode=True)
    from sqlalchemy.orm import sessionmaker
    Session = sessionmaker(bind=engine)
    session = Session()
    Base.metadata.create_all(engine)


    import resource
    sim_set = None
    print('Simulating sequence data for a {:d}x{:d} Huelsenbeck diagram with {:d} simulations for each tree, and {:d} gene trees per simulation, using the following parameters: theta={:.2e}, m={:d}, indelible_model={:s}\n'.format(nr_rows,nr_cols,nr_sims,nr_genes,theta,m,indelible_model))
    sim_set = HuelsenbeckSimulationSet(rows=nr_rows, cols=nr_cols, nr_sims=nr_sims, theta=theta, indelible_model=indelible_model, genes=nr_genes, m=m, n=n, a_max=a_max, b_max=b_max)
    for method in ['raxml']:
        tree_estimate_set = HuelsenbeckTreeEstimateSet(simulation_set=sim_set, method=method, distance_formula=None, alignment_method=multiple_alignment_method, k=None)
    session.add(sim_set)
    session.add(tree_estimate_set)
    # ---- simulation sweep over the (a, b) grid ----
    for row,b in enumerate([(row+1)*(b_max)/nr_rows for row in range(nr_rows)]):
        for col,a in enumerate([(col+1)*(a_max)/nr_cols for col in range(nr_cols)]):
            # Transform the grid values a, b into branch lengths; this is the
            # Jukes-Cantor distance formula -3/4 ln(1 - 4/3 p), scaled by mu.
            t_a = abs(-3.0/4.0*log(1-4.0/3.0*a)/mu)
            t_b = abs(-3.0/4.0*log(1-4.0/3.0*b)/mu)
            tree = huelsenbeck_tree(t_a,t_b,5)
            tree_newick = ")".join(tree.format('newick').split(")")[:-1])+")"
            print(tree_newick)

            xtree = XTree(tree,dict((clade,set([clade.name])) for clade in tree.get_terminals()))
            #print(','.join([''.join(split[0])+'|'+''.join(split[1]) for split in xtree.get_splits()])+" (t_a,t_b) = ({:f},{:f}): ".format(t_a,t_b))

            species_set = sorted(tree.get_terminals(),key=lambda species: species.name)
            n = len(species_set)
            species_names = [species.name for species in species_set]
            genes = [GeneLineage(name='s{:d}'.format(i)) for i,_ in enumerate(range(len(species_set)))]
            gene_embedding = dict(zip(species_set,[[gene] for gene in genes]))
            for sim_it in range(nr_sims):
                # Create simulation objects
                sim = Simulation(tree=tree_newick, theta=theta, indelible_model=indelible_model, genes=nr_genes, m=m, n=n)
                session.add(sim)
                huel_sim = HuelsenbeckSimulation(simulation_set=sim_set, simulation=sim, row=row, col=col)
                session.add(huel_sim)
                # Prepare kmer distance matrices to be used to compute averages
                kmer_distance_matrices = dict()
                finite_counts_matrices = dict()
                for distance_formula in ['dstar', 'ARS2015']:
                    kmer_distance_matrices[distance_formula] = dict()
                    finite_counts_matrices[distance_formula] = dict()
                    for k_i in k:
                        kmer_distance_matrices[distance_formula][k_i] = zero_distance_matrix(species_names)
                        finite_counts_matrices[distance_formula][k_i] = zero_distance_matrix(species_names)
                # Prepare concatenated sequence object
                sample_ids = [sample.name for sample in itertools.chain.from_iterable(gene_embedding.values())]
                concatenated_sequences = dict((sample_id,SeqRecord(Seq('',DNAAlphabet()),id=sample_id,name=sample_id,description=sample_id)) for sample_id in sample_ids)

                for gene in range(nr_genes):
                    # generate gene tree and sequences
                    # for each set of sequence, and for each distance formula and value of k, generate a k-mer distance matrix. sum these matrices for all genes
                    # also store the concatenated sequences
                    coalescent = EmbeddedGeneForest(tree, gene_embedding)
                    coalescent.coalesce(theta)
                    genetree = coalescent.genetree()
                    with TemporaryDirectory() as tmpdir:
                        sequences = mutate_indelible(genetree, m, tmpdir, indelible_model, aligned=False)
                        aligned_sequences = SeqIO.to_dict(align_sequences(sequences, multiple_alignment_method, tmpdir))
                        for sample_id in sample_ids:
                            concatenated_sequences[sample_id] += aligned_sequences[sample_id]
                    for distance_formula in ['dstar', 'ARS2015']:
                        for k_i in k:
                            if distance_formula == 'ARS2015':
                                dm = kmer_distance_matrix(sequences, k_i, normalized_kmer_distance, grouping=gene_embedding)
                                #print(dm)
                            elif distance_formula == 'dstar':
                                dm = kmer_distance_matrix(sequences, k_i, dstar_kmer_distance, grouping=gene_embedding)
                            else:
                                raise Exception
                            # Accumulate finite entries separately so the
                            # average below ignores non-finite distances.
                            finite_counts_matrices[distance_formula][k_i] += dm.isfinite()
                            kmer_distance_matrices[distance_formula][k_i] += dm.nantozero()
                            #print(kmer_distance_matrices[distance_formula][k_i])
                # Average the per-gene matrices and store them.
                for distance_formula in ['dstar', 'ARS2015']:
                    for k_i in k:
                        #print(finite_counts_matrices[distance_formula][k_i])
                        avg_dm = kmer_distance_matrices[distance_formula][k_i]/finite_counts_matrices[distance_formula][k_i]
                        #if distance_formula == 'dstar':
                        #    print(finite_counts_matrices[distance_formula][k_i])
                        #    print(avg_dm)
                        kdm = kmer_distance_matrix_from_dm(avg_dm, sim, distance_formula, None, k_i)
                        session.add(kdm)
                jc_dm = kmer_distance_matrix(concatenated_sequences.values(), 1, aligned_kmer_distance, alignment_fn=stretcher_alignment, grouping=gene_embedding)
                #print(jc_dm)
                kdm = kmer_distance_matrix_from_dm(jc_dm, sim, 'concatdJC', alignment_method, 1)
                session.add(kdm)
                # reconstruct from concatenated sequences using raxml
                with TemporaryDirectory() as tmpdir:
                    # NOTE(review): time.clock() was removed in Python 3.8;
                    # this code requires an older interpreter.
                    t0 = time.clock()
                    xtreehat = RAxML(concatenated_sequences.values(), gene_embedding, tmpdir)
                    t1 = time.clock()

                success = int(xtree.displays(xtreehat))
                print(success)
                tree_estimate = TreeEstimate(simulation=sim, method=tree_estimate_set.method, distance_formula=tree_estimate_set.distance_formula, k=tree_estimate_set.k, splits=','.join([''.join(split[0])+'|'+''.join(split[1]) for split in xtreehat.get_splits()]), success=int(xtree.displays(xtreehat)), dt=t1-t0)
                session.add(tree_estimate)
                #session.commit()
                huel_tree_estimate = HuelsenbeckTreeEstimate(tree_estimate_set=tree_estimate_set, tree_estimate=tree_estimate, huelsenbeck_simulation=huel_sim)
            # NOTE(review): this add sits outside the sim_it loop, so only the
            # last huel_tree_estimate of each grid cell is added — confirm.
            session.add(huel_tree_estimate)

    session.commit()

    # create tree_estimate sets
    for method in kmer_methods:
        if method in ['CoalescentJCNJ', 'CoalescentJCLS', 'JCNJ']:
            distance_formula = 'ARS2015'
            tree_estimate_set = HuelsenbeckTreeEstimateSet(simulation_set=sim_set, method=method, distance_formula=distance_formula, alignment_method=None, k=",".join([str(k_i) for k_i in k]))
        elif method == 'dstarNJ':
            distance_formula = 'dstar'
            tree_estimate_set = HuelsenbeckTreeEstimateSet(simulation_set=sim_set, method=method, distance_formula=distance_formula, alignment_method=None, k=",".join([str(k_i) for k_i in k]))
        elif method == 'concatdJCNJ':
            distance_formula = 'concatdJC'
            #alignment_method = 'clustalo'
            tree_estimate_set = HuelsenbeckTreeEstimateSet(simulation_set=sim_set, method=method, distance_formula=distance_formula, alignment_method=alignment_method, k="1")
        session.add(tree_estimate_set)
        session.commit()

    # fetch tree_estimate sets that do not require full sequence data
    tree_estimate_sets = session.query(HuelsenbeckTreeEstimateSet).\
                                join(HuelsenbeckTreeEstimateSet.simulation_set).\
                                filter(HuelsenbeckTreeEstimateSet.method.in_(kmer_methods)). \
                                filter(HuelsenbeckTreeEstimateSet.simulation_set==sim_set).all()

    # run tree_estimates
    for tree_estimate_set in tree_estimate_sets:
        method = tree_estimate_set.method
        print(method)
        distance_formula = tree_estimate_set.distance_formula
        #alignment_method = tree_estimate_set.alignment_method
        try:
            k = [int(k_i) for k_i in tree_estimate_set.k.split(",")]
        except AttributeError:
            # tree_estimate_set.k is None for alignment-based sets.
            k = None
        for huel_sim in tree_estimate_set.simulation_set.huelsenbeck_simulations:
            sim = huel_sim.simulation
            treedata = sim.tree
            handle = StringIO(treedata)
            #print(handle.read())
            tree = Phylo.read(handle, "newick")
            xtree = XTree(tree,dict((clade,set([clade.name])) for clade in tree.get_terminals()))
            kmer_distance_matrices = dict((kdm.k,kdm.to_dm()) for kdm in sim.kmer_distance_matrices if kdm.k in k and kdm.distance_formula==distance_formula)

            # Dispatch to the reconstruction method and time it.
            t0 = time.clock()
            if method == 'CoalescentJCLS':
                xtreehat = TreeMinDistanceFromFiveTaxonCoalescentJCExpectedKmerDistanceParameterizationMap(kmer_distance_matrices)
            elif method == 'CoalescentJCNJ':
                xtreehat = NJArgMinSumOfDistancesFromCoalescentJCExpectedKmerPairDistanceParameterizationMap(kmer_distance_matrices)
            elif method == 'JCNJ':
                #for k,dm in kmer_distance_matrices.items():
                #    print dm
                adjusted_distance_matrices = dict((k,JCKmerDistanceMatrixAdjustment(kmer_distance_matrix,k)) for k,kmer_distance_matrix in kmer_distance_matrices.items()) 
                #for k,dm in adjusted_distance_matrices.items():
                #    print dm
                xtreehat = NJ(adjusted_distance_matrices)
            elif method == 'dstarNJ':
                #for _,dm in kmer_distance_matrices.items():
                #    print dm
                xtreehat = NJ(kmer_distance_matrices)
            elif method == 'concatdJCNJ':
                adjusted_distance_matrices = {1:JCKmerDistanceMatrixAdjustment(kmer_distance_matrices[1],1)}
                xtreehat = NJ(adjusted_distance_matrices)
            else:
                raise(Exception)
            t1 = time.clock()

            # success == 1 when the true tree displays the estimate.
            success = int(xtree.displays(xtreehat))
            print(','.join([''.join(split[0])+'|'+''.join(split[1]) for split in xtree.get_splits()])+" (t_a,t_b) = ({:f},{:f}): ".format(t_a,t_b)+','.join([''.join(split[0])+'|'+''.join(split[1]) for split in xtreehat.get_splits()])+" ({:d})".format(success))
            #print(k)
            tree_estimate = TreeEstimate(simulation=sim, method=method, distance_formula=distance_formula, k=",".join([str(k_i) for k_i in k]), splits=','.join([''.join(split[0])+'|'+''.join(split[1]) for split in xtreehat.get_splits()]), success=int(xtree.displays(xtreehat)), dt=t1-t0)
            session.add(tree_estimate)
            #session.commit()
            huel_tree_estimate = HuelsenbeckTreeEstimate(tree_estimate_set=tree_estimate_set, tree_estimate=tree_estimate, huelsenbeck_simulation=huel_sim)
            session.add(huel_tree_estimate)
    session.commit()
コード例 #17
0
		yield ''.join(myseq)

def MultiExpandSeq(seq, positions):
    """Yield variants of ``seq`` with bases substituted at ``positions``.

    Draws combinations of distinct bases from 'ATCG' (one base per
    position, via ``itertools.combinations``) and overwrites the
    positions one at a time, yielding the sequence string after EACH
    single substitution — so intermediate states are emitted as well.
    The working character buffer persists across combinations.

    ``seq`` is any object exposing ``tostring()`` (legacy Biopython Seq
    API); ``positions`` is a sequence of 0-based indices into it.
    """
    buffer_chars = list(seq.tostring())
    for base_combo in itertools.combinations('ATCG', len(positions)):
        for target_pos, base in zip(positions, base_combo):
            buffer_chars[target_pos] = base
            yield ''.join(buffer_chars)

# Build a test construct: upstream sequence + spacer + an RBS-bearing tail.
# Two alternative middle segments are kept commented out below for reference.
seq = Seq(''.join(['TCAGCAGGACGCACTGACC',
		   'GAATTCTACTAGT',
		   #'TAATAGAAATAATTTTGTTTAACTTTA',
		   #'CAACAGAAACAACCCCGCCCAATCCCA',
  		   'ACACACACACACACACACACACACACA',
		   'AGGGGATTAATTATGCATCATCACCATCACCACG']),
	  DNAAlphabet())

# Legacy Biopython API: Seq.tostring() returns the plain string.
seq_str = seq.tostring()
print seq_str

# Dump every position/base pair for manual inspection (Python 2 prints).
for i, c in enumerate(seq_str):
	print '%d, %s' % (i, c)

# Locate the ribosome-binding-site motif; the earlier consensus search
# ('AGGAGG') is kept commented out.
#rbs_start = seq_str.find('AGGAGG')
rbs_start = seq_str.find('AGGGGATTAA')

# Locate the end of the region of interest (start of the downstream motif).
#spacer_loc = seq_str.find('ACACACACATGCAT')
spacer_loc = seq_str.find('TAATTATGCATCAT')

# Positions to test: every index from the RBS start up to spacer_loc.
# NOTE(review): len(seq[rbs_start:spacer_loc]) == spacer_loc - rbs_start
# when both motifs are found; if either find() returns -1 this range is
# nonsensical -- confirm inputs always contain both motifs.
test_positions = range(rbs_start, rbs_start + len(seq[rbs_start:spacer_loc]))
print 'Testing positions', test_positions
コード例 #18
0
class TranspositionTest(unittest.TestCase):
    """Tests for the transposon-insertion read simulator.

    Exercises ``InsertGenerator`` / ``Transposition``: generating sheared
    reads with and without a degenerate linker, frame / insertion-site
    bookkeeping, and FASTA round-tripping of the per-read info dict.
    """

    # Target ORF-bearing sequence the transposon is inserted into.
    # NOTE(review): this Seq literal appears to span raw newlines below
    # (likely mangled when the file was exported) -- confirm against the
    # original source file before running.
    TARGET = Seq('GATCTAAAGAGGAGAAAGGATCTATGGATAAGAAATACTCAATAGGCTTAGCTATCGGCACAAATAGCGTCGGATGGGCGGTGATCACTGATGAATATAAGGTTCCGTCTAAAAAGTTCAAGGTTCTGGGAAATACAGACCGCCACAGTATCAAAAAAAATCTTATAGGGGCTCTTTTATTTGACAGTGGAGAGACAGCGGAAGCGACTCGTCTCAAACGGACAGCTCGTAGAAGGTATACACGTCGGAAGAATCGTATTTGTTATCTACAGGAGATTTTTTCAAATGAGATGGCGAAAGTAGATGATAGTTTCTTTCATCGACTTGAAGAGTCTTTTTTGGTGGAAGAAGACAAGAAGCATGAACGTCATCCTATTTTTGGAAATATAGTAGATGAAGTTGCTTATCATGAGAAATATCCAACTATCTATCATCTGCGAAAAAAATTGGTAGATTCTACTGATAAAGCGGATTTGCGCTTAATCTATTTGGCCTTAGCGCATATGATTAAGTTTCGTGGTCATTTTTTGATTGAGGGAGATTTAAATCCTGATAATAGTGATGTGGACAAACTATTTATCCAGTTGGTACAAACCTACAATCAATTATTTGAAGAAAACCCTATTAACGCAAGTGGAGTAGATGCTAAAGCGATTCTTTCTGCACGATTGAGTAAATCAAGACGATTAGAAAATCTCATTGCTCAGCTCCCCGGTGAGAAGAAAAATGGCTTATTTGGGAATCTCATTGCTTTGTCATTGGGTTTGACCCCTAATTTTAAATCAAATTTTGATTTGGCAGAAGATGCTAAATTACAGCTTTCAAAAGATACTTACGATGATGATTTAGATAATTTATTGGCGCAAATTGGAGATCAATATGCTGATTTGTTTTTGGCAGCTAAGAATTTATCAGATGCTATTTTACTTTCAGATATCCTAAGAGTAAATACTGAAATAACTAAGGCTCCCCTATCAGCTTCAATGATTAAACGCTACGATGAACATCATCAAGACTTGACTCTTTTAAAAGCTTTAGTTCGACAACAACTTCCAGAAAAGTATAAAGAAATCTTTTTTGATCAATCAAAAAACGGATATGCAGGTTATATTGATGGGGGAGCTAGCCAAGAAGAATTTTATAAATTTATCAAACCAATTTTAGAAAAAATGGATGGTACTGAGGAATTATTGGTGAAACTAAATCGTGAAGATTTGCTGCGCAAGCAACGGACCTTTGACAACGGCTCTATTCCCCATCAAATTCACTTGGGTGAGCTGCATGCTATTTTGAGAAGACAAGAAGACTTTTATCCATTTTTAAAAGACAATCGTGAGAAGATTGAAAAAATCTTGACTTTTCGAATTCCTTATTATGTTGGTCCATTGGCGCGTGGCAATAGTCGTTTTGCATGGATGACTCGGAAGTCTGAAGAAACAATTACCCCATGGAATTTTGAAGAAGTTGTCGATAAAGGTGCTTCAGCTCAATCATTTATTGAACGCATGACAAACTTTGATAAAAATCTTCCAAATGAAAAAGTACTACCAAAACATAGTTTGCTTTATGAGTATTTTACGGTTTATAACGAATTGACAAAGGTCAAATATGTTACTGAAGGAATGCGAAAACCAGCATTTCTTTCAGGTGAACAGAAGAAAGCCATTGTTGATTTACTCTTCAAAACAAATCGAAAAGTAACCGTTAAGCAATTAAAAGAAGATTATTTCAAAAAAATAGAATGTTTTGATAGTGTTGAAATTTCAGGAGTTGAAGATAGATTTAATGCTTCATTAGGTACCTACCATGATTTGCTAAAAATTATTAAAGATAAAGATTTTTTGGATAATGAAGAAAATGAAGATATCTTAGAGGATATTGTTTTAACATTGACCTTATTTGAAGATAGGGAGATGATTGAGGAAAGACTTAAAACATATGCTCACCTCTTTGATGATAAGGTGATGAAACAGCTTAAACGT
CGCCGTTATACTGGTTGGGGACGTTTGTCTCGAAAATTGATTAATGGTATTAGGGATAAGCAATCTGGCAAAACAATATTAGATTTTTTGAAATCAGATGGTTTTGCCAATCGCAATTTTATGCAGCTGATCCATGATGATAGTTTGACATTTAAAGAAGACATTCAAAAAGCACAAGTGTCTGGACAAGGCGATAGTTTACATGAACATATTGCAAATTTAGCTGGTAGCCCTGCTATTAAAAAAGGTATTTTACAGACTGTAAAAGTTGTTGATGAATTGGTCAAAGTAATGGGGCGGCATAAGCCAGAAAATATCGTTATTGAAATGGCACGTGAAAATCAGACAACTCAAAAGGGCCAGAAAAATTCGCGAGAGCGTATGAAACGAATCGAAGAAGGTATCAAAGAATTAGGAAGTCAGATTCTTAAAGAGCATCCTGTTGAAAATACTCAATTGCAAAATGAAAAGCTCTATCTCTATTATCTCCAAAATGGAAGAGACATGTATGTGGACCAAGAATTAGATATTAATCGTTTAAGTGATTATGATGTCGATGCCATTGTTCCACAAAGTTTCCTTAAAGACGATTCAATAGACAATAAGGTCTTAACGCGTTCTGATAAAAATCGTGGTAAATCGGATAACGTTCCAAGTGAAGAAGTAGTCAAAAAGATGAAAAACTATTGGAGACAACTTCTAAACGCCAAGTTAATCACTCAACGTAAGTTTGATAATTTAACGAAAGCTGAACGTGGAGGTTTGAGTGAACTTGATAAAGCTGGTTTTATCAAACGCCAATTGGTTGAAACTCGCCAAATCACTAAGCATGTGGCACAAATTTTGGATAGTCGCATGAATACTAAATACGATGAAAATGATAAACTTATTCGAGAGGTTAAAGTGATTACCTTAAAATCTAAATTAGTTTCTGACTTCCGAAAAGATTTCCAATTCTATAAAGTACGTGAGATTAACAATTACCATCATGCCCATGATGCGTATCTAAATGCCGTCGTTGGAACTGCTTTGATTAAGAAATATCCAAAACTTGAATCGGAGTTTGTCTATGGTGATTATAAAGTTTATGATGTTCGTAAAATGATTGCTAAGTCTGAGCAAGAAATAGGCAAAGCAACCGCAAAATATTTCTTTTACTCTAATATCATGAACTTCTTCAAAACAGAAATTACACTTGCAAATGGAGAGATTCGCAAACGCCCTCTAATCGAAACTAATGGGGAAACTGGAGAAATTGTCTGGGATAAAGGGCGAGATTTTGCCACAGTGCGCAAAGTATTGTCCATGCCCCAAGTCAATATTGTCAAGAAAACAGAAGTACAGACAGGCGGATTCTCCAAGGAGTCAATTTTACCAAAAAGAAATTCGGACAAGCTTATTGCTCGTAAAAAAGACTGGGATCCAAAAAAATATGGTGGTTTTGATAGTCCAACGGTAGCTTATTCAGTCCTAGTGGTTGCTAAGGTGGAAAAAGGGAAATCGAAGAAGTTAAAATCCGTTAAAGAGTTACTAGGGATCACAATTATGGAAAGAAGTTCCTTTGAAAAAAATCCGATTGACTTTTTAGAAGCTAAAGGATATAAGGAAGTTAAAAAAGACTTAATCATTAAACTACCTAAATATAGTCTTTTTGAGTTAGAAAACGGTCGTAAACGGATGCTGGCTAGTGCCGGAGAATTACAAAAAGGAAATGAGCTGGCTCTGCCAAGCAAATATGTGAATTTTTTATATTTAGCTAGTCATTATGAAAAGTTGAAGGGTAGTCCAGAAGATAACGAACAAAAACAATTGTTTGTGGAGCAGCATAAGCATTATTTAGATGAGATTATTGAGCAAATCAGTGAATTTTCTAAGCGTGTTATTTTAGCAGATGCCAATTTAGATAAAGTTCTTAGTGCATATAACAAACATAGAGACAAACCAATACGTGAACAAGCAGAAAATATTATTCATTTATTTACGTTGACGAATCTTGGAGC
TCCCGCTGCTTTTAAATATTTTGATACAACAATTGATCGTAAACGATATACGTCTACAAAAGAAGTTTTAGATGCCACTCTTATCCATCAATCCATCACTGGTCTTTATGAAACACGCATTGATTTGAGTCAGCTAGGAGGTGACTAACTCGA', DNAAlphabet())
    # Transposon payload sequence and the fixed flanks on either side of it.
    INSERT_SEQ = Seq('CAACGTCGGCGTGTGACGGTGCGCAGGTCGTGCTCGAAGTTAAGTACATG', DNAAlphabet())
    FIXED_5P = Seq('TGCATC')
    FIXED_3P = Seq('GCGTCA')
    # Position of the ORF start within TARGET (as consumed by Transposition).
    ORF_START = 24
    # Degenerate linker pattern (IUPAC ambiguity codes) for the linker tests.
    LINKER_GEN = AmbiguousSequence('BCT')

    def testNoLinker(self):
        """Sheared reads without a linker round-trip their id into the record."""
        insert_gen = InsertGenerator(self.INSERT_SEQ, self.FIXED_5P, self.FIXED_3P,
                                     extra_bp_5p='T')
        # NOTE(review): 1000 x 1000 = 1e6 reads -- this test is slow by design?
        for tn_id in xrange(1000):
            tn_gen = Transposition(tn_id, insert_gen, self.TARGET, self.ORF_START)
            for frag_id in xrange(1000):
                read = tn_gen.Shear(frag_id)
                record = read.ToSeqRecord()

                # NOTE: assertEquals is a deprecated alias of assertEqual.
                self.assertIsNotNone(record.description)
                self.assertEquals(record.name, read.id_str)
                self.assertEquals(record.id, read.id_str)

    def testFrame(self):
        """An insertion site chosen in-frame is reported as such."""
        # TODO: test this more
        insert_gen = InsertGenerator(self.INSERT_SEQ, self.FIXED_5P, self.FIXED_3P,
                                     extra_bp_5p='T')
        # 9 bases past the ORF start (minus 1): a multiple of 3 -> in frame.
        ins = 9 + self.ORF_START - 1
        tn_gen = Transposition(1023, insert_gen, self.TARGET, self.ORF_START,
                               insertion_site=ins)
        self.assertTrue(tn_gen.in_frame)
        self.assertEquals(14, tn_gen.expected_insertion_site)

    def testLinker(self):
        """Same round-trip checks as testNoLinker, with a degenerate linker."""
        insert_gen = InsertGenerator(self.INSERT_SEQ, self.FIXED_5P, self.FIXED_3P,
                                     extra_bp_5p='T', linker_gen=self.LINKER_GEN)
        for tn_id in xrange(1000):
            tn_gen = Transposition(tn_id, insert_gen, self.TARGET, self.ORF_START)
            for frag_id in xrange(1000):
                read = tn_gen.Shear(frag_id)
                record = read.ToSeqRecord()

                self.assertIsNotNone(record.description)
                self.assertEquals(record.name, read.id_str)
                self.assertEquals(record.id, read.id_str)

    def testSerialize(self):
        """Reads written as FASTA can be parsed back with a full info dict."""
        insert_gen = InsertGenerator(self.INSERT_SEQ, self.FIXED_5P, self.FIXED_3P,
                                     extra_bp_5p='T', linker_gen=self.LINKER_GEN)

        records = []

        for tn_id in xrange(10):
            tn_gen = Transposition(tn_id, insert_gen, self.TARGET, self.ORF_START)
            records.extend([tn_gen.Shear(i).ToSeqRecord() for i in xrange(100)])

        # Serialize to an in-memory FASTA file.
        outfile = StringIO()
        writer = FastaIO.FastaWriter(outfile)
        writer.write_header()
        writer.write_records(records)

        # Parse the generated output.
        infile = StringIO(outfile.getvalue())
        parsed = SeqIO.parse(infile, 'fasta')
        expected_info_keys = Fragment.INFO_DICT_KEYS
        for record in parsed:
            info_dict = Fragment.ParseInfoDict(record)
            self.assertListEqual(sorted(info_dict.keys()), sorted(expected_info_keys))
コード例 #19
0
def main():
    """Design exon-junction PCR primers for every mRNA listed in gene_list.

    Steps per target mRNA: rebuild the spliced transcript from the genome
    FASTA and the GFF exon records, write a Primer3 input file, run
    primer3_core, BLAST the returned primer pairs against the genome and
    the transcript set, and emit a per-mRNA TSV summary plus a primer plot.

    Relies on module-level configuration: genome, gff3, transcripts_seq,
    temp_file_loc, gene_list, and the primer/product size thresholds.
    Exits with status 1 if blastn or primer3_core is not on $PATH.
    """
    # os.system returns the raw exit status; 0 means the tool ran fine.
    chk_blastn = not (os.system('blastn -version'))
    # primer3_core -help exits non-zero even when installed; a working
    # install has been observed to return status 65280 (255 << 8).
    chk_primer3 = int(os.system('primer3_core -help'))  #  65280
    chk_genome_index = os.path.isfile(genome + '.idx')
    chk_gff_index = os.path.isfile(gff3 + '.db.idx')
    if chk_primer3 != 65280:
        print >> sys.stderr, "primer3_core not found on $PATH!!"
        sys.exit(1)
    if not chk_blastn:
        print >> sys.stderr, "blastn not found on $PATH!!"
        sys.exit(1)
    # Build the genome FASTA index once; subsequent runs reuse the .idx file.
    if not chk_genome_index:
        print >> sys.stderr, "Creating Genome index"
        idx = SeqIO.index_db(genome + '.idx', genome, "fasta")
        idx.close()
        print >> sys.stderr, " --DONE--"
    genome_idx = SeqIO.index_db(genome + '.idx')
    #print genome_idx['Chr01'].seq[1:10]
    if not chk_gff_index:
        print >> sys.stderr, "Creating GFF index.."
        gffutils.create_db(gff3, gff3 + '.db.idx')
        print >> sys.stderr, " --DONE--"
    gff_db = gffutils.FeatureDB(gff3 + '.db.idx')
    if not os.path.isdir(temp_file_loc):
        os.makedirs(temp_file_loc)
    # Build BLAST nucleotide databases once; .nin is the db index file.
    if not os.path.isfile(genome + '.nin'):
        cmd = "makeblastdb -in %s -dbtype nucl" % (genome)
        print >> sys.stderr, "Creating genome blast db..\n" + cmd
        os.system(cmd)
    if not os.path.isfile(transcripts_seq + '.nin'):
        cmd = "makeblastdb -in %s -dbtype nucl" % (transcripts_seq)
        print >> sys.stderr, "Creating transcript seq blast db..\n" + cmd
        os.system(cmd)
    # NOTE(review): min_Tm is reported here but PRIMER_MIN_TM is never
    # written to the Primer3 input below -- confirm that is intentional.
    print_params(opt_primer_len, surround_exon_junc, min_five_overlap,
                 min_three_overlap, min_product_size, max_product_size,
                 min_GC_perc, min_Tm, min_distance_3_primer,
                 Blastn_extra_params)

    target_mRNAs = open(gene_list, 'r')
    for l in target_mRNAs:
        l = l.rstrip()
        mRNA = gff_db[l]
        print >> sys.stderr, "Processing: %s %s %d bp" % (
            mRNA.id, mRNA.strand, (mRNA.end - mRNA.start) + 1)
        mRNA_seq = ''
        primer_3_seq = ''  # Just keeping a tack.
        exon_array = list()
        exon_junctions_list = list()  # list of exon-exon junctions
        # Exon IDs are expected to follow the "<mRNA>.exon.<n>" convention.
        exons = sorted([f.id for f in gff_db.children(l, featuretype='exon')])
        if exons[0] != l + '.exon.1':
            print >> sys.stderr, "Exon ID format mismatched!!. Expecting %s. Found %s" % (
                l + '.exon.1', exons[0])
        #else:
        # print >> sys.stderr, 'Correct exon format!!'

        for e in xrange(1, len(exons) + 1):
            # For each exon segment
            ex = gff_db[l + '.exon.' + str(e)]
            exon_array.append(ex)  ## keepin order constant
            #print ex.id
            s = genome_idx[ex.seqid].seq[
                ex.start -
                1:ex.end]  # as one 1-indexed gff3, python uses 0-index base
            if mRNA.strand == '-':
                s = s.reverse_complement()
            mRNA_seq += str(s)
            primer_3_seq += str(s)
            if e != len(exons):
                primer_3_seq += '-'
                # Junction position = cumulative spliced length so far.
                exon_junctions_list.append(len(mRNA_seq))
        ## Store mRNA sequence as Bio.Seq() object
        mRNA_seq = Seq(mRNA_seq, DNAAlphabet())

        ## Generate Primer3_input file
        output = open(
            os.path.join(temp_file_loc, mRNA.id + '.primer3_input.txt'), 'w')
        input_data = [
            'SEQUENCE_ID=' + mRNA.id, 'SEQUENCE_TEMPLATE=' + str(mRNA_seq),
            'PRIMER_TASK=pick_pcr_primers',
            'PRIMER_MIN_3_PRIME_OVERLAP_OF_JUNCTION=' + str(min_three_overlap),
            'PRIMER_MIN_5_PRIME_OVERLAP_OF_JUNCTION=' + str(min_five_overlap),
            'PRIMER_MIN_THREE_PRIME_DISTANCE=' + str(min_distance_3_primer),
            'PRIMER_OPT_SIZE=' + str(opt_primer_len),
            'PRIMER_MIN_SIZE=' + str(opt_primer_len - 2),
            'PRIMER_MAX_SIZE=' + str(opt_primer_len + 2),
            'PRIMER_MIN_GC=' + str(min_GC_perc), 'PRIMER_MAX_NS_ACCEPTED=1',
            # BUGFIX: the upper bound of the product-size range previously
            # repeated min_product_size, collapsing the range to a single
            # size; it must be max_product_size.
            'PRIMER_PRODUCT_SIZE_RANGE=' + str(min_product_size) + '-' +
            str(max_product_size), 'P3_FILE_FLAG=0', 'PRIMER_EXPLAIN_FLAG=1',
            'PRIMER_THERMODYNAMIC_PARAMETERS_PATH=/media/winterfell/kanhu/SOFTWARES/primer3-2.3.7/src/primer3_config/'
        ]
        if surround_exon_junc:
            # Ask Primer3 to amplify ACROSS each junction (50 bp window).
            input_data.append(
                'SEQUENCE_TARGET=' +
                " ".join([str(j - 50) + ',50' for j in exon_junctions_list]))
        else:
            # Ask Primer3 to place a primer DIRECTLY ON a junction.
            input_data.append('SEQUENCE_OVERLAP_JUNCTION_LIST=' +
                              " ".join([str(j) for j in exon_junctions_list]))
        input_data.append('=')
        print >> output, "\n".join(input_data)
        output.close()

        ## RUN primer3_core
        print >> sys.stderr, "## RUNNING Primer3 ##"
        if generate_primer3_formatted_output:
            cmd = "primer3_core -format_output -output=%s  < %s" % (
                os.path.join(temp_file_loc,
                             mRNA.id + '.primer3_Formatted_output.txt'),
                os.path.join(temp_file_loc, mRNA.id + '.primer3_input.txt'))
            print >> sys.stderr, "\t ", cmd
            os.system(cmd)
        cmd = "primer3_core -output=%s  < %s" % (os.path.join(
            temp_file_loc, mRNA.id + '.primer3_detailed_output.txt'
        ), os.path.join(temp_file_loc, mRNA.id + '.primer3_input.txt'))
        print >> sys.stderr, "\t ", cmd
        os.system(cmd)

        ## Parse primer3 default output
        pri3_results = parse_primer3_detailed_output(
            os.path.join(temp_file_loc,
                         mRNA.id + '.primer3_detailed_output.txt'))
        # Skip this mRNA when Primer3 returned no usable primer pairs.
        if (not 'PRIMER_PAIR_NUM_RETURNED' in pri3_results) or (int(
                pri3_results['PRIMER_PAIR_NUM_RETURNED']) == 0):
            print >> sys.stderr, "\t No primer pairs found for mRNA: %s " % mRNA.id
            continue
        ### Generate fasta files of primers
        Fas_output = open(
            os.path.join(temp_file_loc, mRNA.id + '.primer3_output.fas'), 'w')
        for i in xrange(int(pri3_results['PRIMER_PAIR_NUM_RETURNED'])):
            print >> Fas_output, ">%s\n%s" % (
                'PRIMER_LEFT_' + str(i),
                pri3_results['PRIMER_LEFT_' + str(i) + '_SEQUENCE'])
            print >> Fas_output, ">%s\n%s" % (
                'PRIMER_RIGHT_' + str(i),
                pri3_results['PRIMER_RIGHT_' + str(i) + '_SEQUENCE'])
        Fas_output.close()
        # Count off-target hits: primers vs whole genome, then vs transcripts.
        print >> sys.stderr, "## RUNNING Blastn Vs genome ##"
        cmd = "blastn -db %s -query %s -outfmt 6 -out %s %s" % (
            genome, os.path.join(temp_file_loc,
                                 mRNA.id + '.primer3_output.fas'),
            os.path.join(temp_file_loc, mRNA.id +
                         '.primers.g.blastn_output.tsv'), Blastn_extra_params)
        print >> sys.stderr, "\t ", cmd
        os.system(cmd)
        genome_blastn_out_dict = parse_blastn_tab_output(
            os.path.join(temp_file_loc,
                         mRNA.id + '.primers.g.blastn_output.tsv'))
        print >> sys.stderr, "## RUNNING Blastn Vs Transcripts ##"
        cmd = "blastn -db %s -query %s -outfmt 6 -out %s %s" % (
            transcripts_seq,
            os.path.join(temp_file_loc, mRNA.id + '.primer3_output.fas'),
            os.path.join(temp_file_loc, mRNA.id +
                         '.primers.t.blastn_output.tsv'), Blastn_extra_params)
        print >> sys.stderr, "\t ", cmd
        os.system(cmd)
        transcript_blastn_out_dict = parse_blastn_tab_output(
            os.path.join(temp_file_loc,
                         mRNA.id + '.primers.t.blastn_output.tsv'))

        ## Final output
        output = open(mRNA.id + '.primer3_output.tsv', 'w')
        print >> output, "mRNA\tPRIMER_SERIAL_NO\tLEFT_PRIMER\tLEFT_GC_PERCENT\tLEFT_TM\tLEFT_HAIRPIN_TH\tLEFT_END_STABILITY\tRIGHT_PRIMER\tRIGHT_GC_PERCENT\tRIGHT_TM\tRIGHT_HAIRPIN_TH\tRIGHT_END_STABILITY\tLEFT_Genome_BLASTN_HITS\tRIGHT_Genome_BLASTN_HITS\tLEFT_transcrpt_BLASTN_HITS\tRIGHT_transcript_BLASTN_HITS"
        for i in xrange(int(pri3_results['PRIMER_PAIR_NUM_RETURNED'])):
            # Primers with no BLAST hits are absent from the dicts; default to 0.
            if not 'PRIMER_LEFT_' + str(i) in genome_blastn_out_dict:
                genome_blastn_out_dict['PRIMER_LEFT_' + str(i)] = 0
            if not 'PRIMER_RIGHT_' + str(i) in genome_blastn_out_dict:
                genome_blastn_out_dict['PRIMER_RIGHT_' + str(i)] = 0
            if not 'PRIMER_LEFT_' + str(i) in transcript_blastn_out_dict:
                transcript_blastn_out_dict['PRIMER_LEFT_' + str(i)] = 0
            if not 'PRIMER_RIGHT_' + str(i) in transcript_blastn_out_dict:
                transcript_blastn_out_dict['PRIMER_RIGHT_' + str(i)] = 0

            out = [
                mRNA.id,
                str(i), pri3_results['PRIMER_LEFT_' + str(i) + '_SEQUENCE'],
                pri3_results['PRIMER_LEFT_' + str(i) + '_GC_PERCENT'],
                pri3_results['PRIMER_LEFT_' + str(i) + '_TM'],
                pri3_results['PRIMER_LEFT_' + str(i) + '_HAIRPIN_TH'],
                pri3_results['PRIMER_LEFT_' + str(i) + '_END_STABILITY'],
                pri3_results['PRIMER_RIGHT_' + str(i) + '_SEQUENCE'],
                pri3_results['PRIMER_RIGHT_' + str(i) + '_GC_PERCENT'],
                pri3_results['PRIMER_RIGHT_' + str(i) + '_TM'],
                pri3_results['PRIMER_RIGHT_' + str(i) + '_HAIRPIN_TH'],
                pri3_results['PRIMER_RIGHT_' + str(i) + '_END_STABILITY'],
                str(genome_blastn_out_dict['PRIMER_LEFT_' + str(i)]),
                str(genome_blastn_out_dict['PRIMER_RIGHT_' + str(i)]),
                str(transcript_blastn_out_dict['PRIMER_LEFT_' + str(i)]),
                str(transcript_blastn_out_dict['PRIMER_RIGHT_' + str(i)])
            ]
            print >> output, "\t".join(out)
        output.close()
        print >> sys.stderr, "## Ploting "
        draw_primers(exon_array, mRNA.id)
    target_mRNAs.close()
    genome_idx.close()
コード例 #20
0
def change_biopython_record_sequence(record, new_seq):
    """Return a deep copy of ``record`` whose sequence is ``new_seq``.

    The input record is left untouched; the copy carries a ``Seq`` built
    from ``new_seq`` with a DNA alphabet (legacy Biopython API).
    """
    updated_record = deepcopy(record)
    updated_record.seq = Seq(new_seq, alphabet=DNAAlphabet())
    return updated_record
コード例 #21
0
ファイル: genes.py プロジェクト: anushchp/etfl
 def __init__(self, id, name, sequence, *args, **kwargs):
     """Initialize a gene that carries an explicit DNA sequence.

     Delegates id/name (and any extra arguments) to ``Gene.__init__``,
     then stores ``sequence`` as a Biopython ``Seq`` with a DNA alphabet.
     """
     Gene.__init__(self, id, name, *args, **kwargs)
     self.sequence = Seq(sequence, DNAAlphabet())
     # Derived products start empty; presumably filled in later by
     # transcription/translation steps -- confirm against callers.
     self._rna = ''
     self._peptide = ''