def records_from_data_files(filepaths=None, folder=None):
    """Automatically convert files or a folder's content to Biopython records.

    Parameters
    ----------
    filepaths
      List of paths to sequence files (GenBank, Fasta, ...) or zip archives.
      Ignored when ``folder`` is provided.
    folder
      Path to a folder whose files will be read instead of ``filepaths``.

    Returns
    -------
    records
      List of Biopython SeqRecords, one per record found in the files.

    Raises
    ------
    ValueError
      If neither ``filepaths`` nor ``folder`` is provided.
    """
    if folder is not None:
        filepaths = [f._path for f in flametree.file_tree(folder)._all_files]
    if filepaths is None:
        # Fail early with a clear message instead of raising a cryptic
        # TypeError when iterating over None below.
        raise ValueError("Provide either `filepaths` or `folder`.")
    # Placeholder identifiers emitted by common editors/exporters; records
    # carrying one of these get renamed after the file. Hoisted out of the
    # loop: this list is constant.
    UNKNOWN_IDS = [
        "None",
        "",
        "<unknown id>",
        ".",
        "EXPORTED",
        "<unknown name>",
        "Exported",
    ]
    records = []
    for filepath in filepaths:
        filename = os.path.basename(filepath)
        if filename.lower().endswith("zip"):
            records += records_from_zip_file(filepath)
            continue
        recs, fmt = records_from_file(filepath)
        single_record = len(recs) == 1
        for i, record in enumerate(recs):
            # Fix: strip only the final extension. The previous
            # ``"".join(filename.split(".")[:-1])`` silently removed every
            # dot from multi-dot filenames ("my.part.gb" -> "mypart").
            name_no_extension = os.path.splitext(filename)[0]
            name = name_no_extension + ("" if single_record else ("%04d" % i))
            name = name.replace(" ", "_")
            if has_dna_alphabet:  # Biopython <1.78
                record.seq.alphabet = DNAAlphabet()
            record.annotations["molecule_type"] = "DNA"
            # Sorry for this parts, it took a lot of "whatever works".
            # keep your part names under 20c and pointless, and everything
            # will be good
            if str(record.id).strip() in UNKNOWN_IDS:
                record.id = name
            if str(record.name).strip() in UNKNOWN_IDS:
                record.name = name
            record.file_name = name_no_extension
        records += recs
    return records
def to_record(self, record=None, record_id=None):
    """Return a Biopython SeqRecord of the quote.

    >>> record = to_record(solution)
    >>> # Let's plot with DnaVu:
    >>> from dnavu import create_record_plot
    >>> from bokeh.io import output_file, show
    >>> output_file("view.html")
    >>> plot = create_record_plot(record)
    >>> show(plot)
    """
    # Default the record id to the quote's own id.
    final_id = self.id if record_id is None else record_id

    if record is not None:
        # Never mutate the caller's record.
        record = deepcopy(record)
    elif has_dna_alphabet:  # Biopython <1.78
        record = SeqRecord(Seq(self.sequence, DNAAlphabet()), id=final_id)
    else:
        record = SeqRecord(Seq(self.sequence), id=final_id)
        record.annotations["molecule_type"] = "DNA"

    if self.plan is not None:
        # One misc_feature per sub-quote, prepended so plan features take
        # precedence over any pre-existing features.
        plan_features = []
        for quote in self.plan:
            location = FeatureLocation(quote.segment_start, quote.segment_end, 1)
            qualifiers = {
                "label": "%s - From %s" % (quote.id, quote.source),
                "name": quote.id,
                "source": quote.source,
                "price": quote.price,
                "lead_time": quote.lead_time,
            }
            plan_features.append(
                SeqFeature(location, type="misc_feature", qualifiers=qualifiers)
            )
        record.features = plan_features + record.features
    return record
def write_record(
    record,
    target,
    file_format="genbank",
    remove_locationless_features=True,
    max_name_length=20,
):
    """Write a record as genbank, fasta, etc. via Biopython, with fixes.

    Parameters
    ----------
    record
      A biopython record
    target
      Path to a file or filelike object.
    file_format
      Format, either Genbank or fasta
    remove_locationless_features
      If True, will remove all features whose location is None, to avoid
      a Biopython bug
    max_name_length
      The record's name will be truncated if longer than this (also here
      to avoid a biopython bug).
    """
    record = deepcopy(record)
    if remove_locationless_features:
        record.features = [
            f for f in record.features if f.location is not None
        ]
    record.name = record.name[:max_name_length]
    # Fix: only Biopython <1.78 sequences carry an `.alphabet`; touching it
    # unconditionally raised AttributeError on modern Biopython. On >=1.78,
    # GenBank output instead requires a "molecule_type" annotation.
    if hasattr(record.seq, "alphabet"):  # Biopython <1.78
        if str(record.seq.alphabet.__class__.__name__) != "DNAAlphabet":
            record.seq.alphabet = DNAAlphabet()
    else:
        record.annotations.setdefault("molecule_type", "DNA")
    if hasattr(target, "open"):
        # Accept flametree-style directory objects as targets.
        target = target.open("w")
    SeqIO.write(record, target, file_format)
def records_from_data_files(data_files, use_file_names_as_ids=False):
    """Convert a list of uploaded file objects into Biopython records.

    Parameters
    ----------
    data_files
      List of file-like objects with at least ``.name``; zip archives are
      expanded via ``records_from_zip_file``.
    use_file_names_as_ids
      If True, single-record files get the file name (no extension) as
      record id.

    Returns
    -------
    records
      List of Biopython SeqRecords, annotated with circularity and source
      file name.
    """
    # Placeholder identifiers emitted by common editors/exporters; constant,
    # so built once instead of once per record.
    UNKNOWN_IDS = [
        "None",
        "",
        "<unknown id>",
        ".",
        "EXPORTED",
        "<unknown name>",
        "Exported",
    ]
    records = []
    for file_ in data_files:
        # Default to circular unless the file object explicitly says linear.
        circular = ("circular" not in file_) or file_.circular
        if file_.name.lower().endswith("zip"):
            records += records_from_zip_file(
                file_, use_file_names_as_ids=use_file_names_as_ids)
            continue
        recs, fmt = records_from_data_file(file_)
        single_record = len(recs) == 1
        for i, record in enumerate(recs):
            record.circular = circular
            record.linear = not circular
            # Fix: strip only the final extension. The previous
            # ``"".join(file_.name.split(".")[:-1])`` removed every dot from
            # multi-dot filenames ("my.part.gb" -> "mypart").
            name_no_extension = os.path.splitext(file_.name)[0]
            name = name_no_extension + ("" if single_record else ("%04d" % i))
            name = name.replace(" ", "_")
            # NOTE(review): assumes Biopython <1.78 (Seq with .alphabet).
            record.seq.alphabet = DNAAlphabet()
            # Sorry for this parts, it took a lot of "whatever works".
            # keep your part names under 20c and pointless, and everything
            # will be good
            if str(record.id).strip() in UNKNOWN_IDS:
                record.id = name
            if str(record.name).strip() in UNKNOWN_IDS:
                record.name = name
            record.file_name = name_no_extension
            if use_file_names_as_ids and single_record:
                # ``source_file`` is presumably set by records_from_data_file.
                basename = os.path.basename(record.source_file)
                basename_no_extension = os.path.splitext(basename)[0]
                record.id = basename_no_extension
        records += recs
    return records
def assemble_with(self, other, annotate_homology=False, annotation_type="homology"):
    """Return a new StickyEndFragment of this fragment joined with ``other``.

    The right sticky end of this fragment becomes the connector between the
    two fragments; it can optionally be annotated as a homology region.
    """
    # The connector is the (blunt) sequence of our right overhang.
    connector = SeqRecord(Seq(str(self.seq.right_end)))
    if annotate_homology:
        self.annotate_connector(connector, annotation_type=annotation_type)
    # Re-wrap self as a plain SeqRecord so SeqRecord.__add__ merges the
    # features/annotations of all three pieces.
    plain_self = SeqRecord(
        seq=Seq(str(self.seq)),
        features=self.features,
        annotations=self.annotations,
    )
    assembled = SeqRecord.__add__(plain_self, connector).__add__(other)
    # Restore the sticky-end-aware sequence and class on the result.
    assembled.seq = self.seq + other.seq
    assembled.__class__ = StickyEndFragment
    if has_dna_alphabet:  # Biopython <1.78
        assembled.seq.alphabet = DNAAlphabet()
    assembled.annotations["molecule_type"] = "DNA"
    return assembled
def assemble_with(self, other, annotate_homology=False,
                  annotation_type="misc_feature", **qualifiers):
    """Return a new StickyEndsSeqRecord of this record joined with ``other``.

    The right sticky end of this record is used as the connector; when
    ``annotate_homology`` is true it is annotated with ``annotation_type``
    and the given qualifiers (default label: "homology").
    """
    connector = SeqRecord(Seq(str(self.seq.right_end)))
    if not qualifiers:
        qualifiers = {"label": "homology"}
    if annotate_homology:
        homology_feature = SeqFeature(
            FeatureLocation(0, len(connector), 1),
            type=annotation_type,
            qualifiers=qualifiers,
        )
        connector.features = [homology_feature]
    # Re-wrap self as a plain SeqRecord so SeqRecord.__add__ merges the
    # features/annotations of all three pieces.
    plain_self = SeqRecord(
        seq=Seq(str(self.seq)),
        features=self.features,
        annotations=self.annotations,
    )
    assembled = SeqRecord.__add__(plain_self, connector).__add__(other)
    # Restore the sticky-end-aware sequence and class on the result.
    assembled.seq = self.seq + other.seq
    assembled.__class__ = StickyEndsSeqRecord
    assembled.seq.alphabet = DNAAlphabet()
    return assembled
def get_list_of_seqrecords_from_collection(self) -> list:
    """Return all documents of the collection as a list of SeqRecords.

    Documents without a sequence are skipped. Returns None when the
    collection yields no usable record, and None (implicitly) after
    logging if any exception occurs.
    """
    # get_collection() is called outside the try block, as before.
    collection = self.get_collection()
    try:
        records = []
        for document in collection.find({}):
            # Go through the DTO so any validation it performs still runs.
            gene = GeneDTO()
            gene.gene_id = document[Constants.GENE_ID]
            gene.sequence = document[Constants.SEQUENCE]
            if gene.sequence is None:
                continue
            records.append(
                SeqRecord(Seq(gene.sequence, DNAAlphabet()), id=gene.gene_id)
            )
        # Preserve the original contract: empty result -> None.
        if not records:
            return None
        return records
    except Exception as error:
        # Boundary-level best-effort logging; caller receives None.
        print(
            'Caught exception getting all elements of collection as SeqRecords list: '
            + repr(error))
def sequence_to_biopython_record(sequence, id="<unknown id>",
                                 name="same_as_id", features=()):
    """Return a SeqRecord of the sequence, ready to be Genbanked."""
    # "same_as_id" is a sentinel meaning: reuse the id as the record name.
    record_name = id if name == "same_as_id" else name
    if has_dna_alphabet:  # Biopython <1.78
        seq = Seq(sequence, alphabet=DNAAlphabet())
    else:
        seq = Seq(sequence)
    seqrecord = SeqRecord(
        seq,
        id=id,
        name=record_name,
        features=list(features),
    )
    seqrecord.annotations["molecule_type"] = "DNA"
    return seqrecord
for distance_formula in ['dstar', 'ARS2015']: kmer_distance_matrices[distance_formula] = dict() finite_counts_matrices[distance_formula] = dict() for k_i in k: kmer_distance_matrices[distance_formula][ k_i] = zero_distance_matrix(species_names) finite_counts_matrices[distance_formula][ k_i] = zero_distance_matrix(species_names) # Prepare concatenated sequence object sample_ids = [ sample.name for sample in itertools.chain.from_iterable( base_embedding.values()) ] concatenated_sequences = dict( (sample_id, SeqRecord(Seq('', DNAAlphabet()), id=sample_id, name=sample_id, description=sample_id)) for sample_id in sample_ids) for gene in range(nr_genes): # generate gene tree and sequences # for each set of sequence, and for each distance formula and value of k, generate a k-mer distance matrix. sum these matrices for all genes # also store the concatenated sequences coalescent = EmbeddedGeneForest(tree, base_embedding) coalescent.coalesce(theta) genetree = coalescent.genetree() with TemporaryDirectory() as tmpdir: sequences = mutate_indelible(genetree, m,
def clean_seqs(fasta_in, fasta_out=None, filter_include_expression=None,
               filter_exclude_expression=None, bp_ranges=None,
               start_date=None, end_date=None, ungap=None):
    """Filter and trim a fasta file, writing cleaned records to a new fasta.

    Parameters
    ----------
    fasta_in
      An open file (or object with ``.name``) pointing at the input fasta.
    fasta_out
      Output path; defaults to ``<input>_cleaned<summary>.fasta`` next to
      the input file.
    filter_include_expression / filter_exclude_expression
      Regexes matched against each record's id and description.
    bp_ranges
      List of 1-indexed (start, end) tuples; kept regions are concatenated.
    start_date / end_date
      Datetime bounds compared against any ISO date found in id/description.
    ungap
      If not None, the gap character to remove from sequences.

    Raises
    ------
    IOError
      If the output file already exists.
    """
    iso_date_re = re.compile(r'(\d{4}-\d{2}-\d{2})')
    bp_ranges = bp_ranges or []

    # Build a human-readable suffix summarizing the applied filters,
    # used in the default output file name.
    bp_range_str = "_".join([str(t[0]) + "-" + str(t[1]) + "bp" for t in bp_ranges])
    start_date_str = "" if not start_date else "starting_" + start_date.strftime("%Y-%m-%d")
    end_date_str = "" if not end_date else "ending_" + end_date.strftime("%Y-%m-%d")
    filter_include_str = "" if not filter_include_expression else "only_subset_by_filter"
    filter_exclude_str = "" if not filter_exclude_expression else "excluding_some_by_filter"
    output_summary_string = "_".join(
        s for s in [bp_range_str, start_date_str, end_date_str,
                    filter_include_str, filter_exclude_str]
        if len(s) > 0)
    if len(output_summary_string) > 0:
        output_summary_string = "_" + output_summary_string

    in_fasta_basename = os.path.splitext(os.path.basename(fasta_in.name))[0]
    out_basedir = os.path.realpath(os.path.dirname(fasta_in.name))
    out_filepath = fasta_out or os.path.join(
        out_basedir,
        in_fasta_basename + "_cleaned" + output_summary_string + ".fasta")
    if os.path.exists(out_filepath):
        raise IOError("%s already exists; skipping..." % out_filepath)

    if filter_include_expression:
        filter_include_re = re.compile(filter_include_expression)
    if filter_exclude_expression:
        filter_exclude_re = re.compile(filter_exclude_expression)

    with open(out_filepath, "w") as handle:
        # Fix: use a distinct name for the writer; previously this rebound
        # the `fasta_out` parameter, which was confusing.
        writer = FastaIO.FastaWriter(handle, wrap=80)  # wrap=None
        writer.write_header()
        for record in SeqIO.parse(fasta_in.name, "fasta"):
            should_output = True
            if filter_include_expression:
                # Include-filter: keep only records matching in id or
                # description.
                should_output = False
                if (filter_include_re.search(record.id)
                        or filter_include_re.search(record.description)):
                    should_output = True
            if filter_exclude_expression and (
                    filter_exclude_re.search(record.id)
                    or filter_exclude_re.search(record.description)):
                should_output = False
            if start_date:
                # Any ISO date found in either field that is too early
                # disqualifies the record.
                for field in [record.description, record.id]:
                    match = iso_date_re.search(field)
                    if match:
                        seq_date = datetime.datetime.strptime(
                            match.group(0), "%Y-%m-%d")
                        if seq_date < start_date:
                            should_output = False
            if end_date:
                for field in [record.description, record.id]:
                    match = iso_date_re.search(field)
                    if match:
                        seq_date = datetime.datetime.strptime(
                            match.group(0), "%Y-%m-%d")
                        if seq_date > end_date:
                            should_output = False
            if should_output:
                if len(bp_ranges) == 0:
                    record.seq = MutableSeq(str(record.seq).upper(), DNAAlphabet())
                else:
                    output_seq = MutableSeq("", DNAAlphabet())
                    for start, end in bp_ranges:
                        start -= 1  # remove one since biopython seqs are zero-indexed
                        # end stays as-is: slice upper bound is exclusive
                        start = max(start, 0)  # bound to limit of sequence
                        end = min(end, len(record))  # bound to limit of sequence
                        output_seq += record.seq[start:end]
                    record.seq = Seq(str(output_seq).upper(), DNAAlphabet())
                # Fix: identity comparison with None (was `ungap != None`).
                if ungap is not None:
                    record.seq = Seq(str(record.seq).upper(),
                                     DNAAlphabet()).ungap(ungap)
                # Set the id to the description, which is the ID in the case
                # of GISAID, and remove the description. Strings are
                # immutable, so the previous copy.deepcopy was a no-op cost.
                record.id = record.description.replace(" ", CHARACTER_TO_USE)
                record.description = ""
                writer.write_record(record)
# Load all reference sequences into a dict keyed by fasta id, uppercased so
# the k-mer searches below are case-insensitive. (Python 2 script: note
# iteritems and the never-closed file handle.)
seqFile = open(args.seq, 'r')
seqDict = SeqIO.to_dict(SeqIO.parse(seqFile, "fasta"))
for key, seq in seqDict.iteritems():
    seqDict[key] = seq.upper()
# Classify every k-mer count line with a state column:
#   0 -> k-mer found, farther than args.flank bp from both sequence ends
#   1 -> k-mer found within args.flank bp of an end
#   2 -> sequence unknown, or k-mer not found on either strand
# NOTE(review): `args`, `countsFile` and `outFile` are defined earlier in
# this script, outside the visible chunk.
for line in countsFile:
    vals = line.split()
    seqName = vals[0]
    state = 0
    if (seqName not in seqDict):
        state = 2
    else:
        kmer = vals[1]
        # Also search the reverse complement of the k-mer.
        kmerrc = Seq.Seq(kmer, DNAAlphabet()).reverse_complement()
        kmerPos = seqDict[seqName].seq.find(kmer)
        kmerrcPos = seqDict[seqName].seq.find(kmerrc)
        pos = kmerPos
        if (kmerrcPos >= 0):
            # A reverse-complement hit takes precedence over a forward hit.
            pos = kmerrcPos
        if (pos >= 0):
            # Distance from the hit position to the nearest sequence end.
            distToFlank = min(pos, len(seqDict[seqName].seq) - pos)
            # print str(pos) + " " + str(distToFlank) + " " + str(len(seqDict[seqName].seq)) + " " + seqName
            if (distToFlank < args.flank):
                state = 1
            else:
                state = 0
        else:
            state = 2
    outFile.write(line.strip() + "\t" + str(state) + "\n")
def load_records_from_files(files=None, folder=None, use_file_names_as_ids=False):
    """Automatically convert files or a folder's content to biopython records.

    Parameters
    ----------
    files
      A list of path to files. A ``folder`` can be provided instead.
    folder
      A path to a folder containing sequence files.
    use_file_names_as_ids
      If True, for every file containing a single record, the file name
      (without extension) will be set as the record's ID.
    """
    if files is not None:
        for file in files:
            if isinstance(file, str) and not os.path.exists(file):
                raise IOError("File %s not found" % file)
    if folder is not None:
        files = [f._path for f in flametree.file_tree(folder)._all_files]
    # Placeholder identifiers emitted by common editors/exporters; constant,
    # so built once instead of once per record.
    UNKNOWN_IDS = [
        "None",
        "",
        "<unknown id>",
        ".",
        "EXPORTED",
        "<unknown name>",
        "Exported",
    ]
    records = []
    for filepath in files:
        filename = os.path.basename(filepath)
        if filename.lower().endswith("zip"):
            records += _load_records_from_zip_file(
                filepath, use_file_names_as_ids=use_file_names_as_ids)
            continue
        recs, fmt = load_records_from_file(filepath)
        single_record = len(recs) == 1
        for i, record in enumerate(recs):
            # Fix: strip only the final extension. The previous
            # ``"".join(filename.split(".")[:-1])`` removed every dot from
            # multi-dot filenames ("my.part.gb" -> "mypart").
            name_no_extension = os.path.splitext(filename)[0]
            name = name_no_extension + ("" if single_record else ("%04d" % i))
            name = name.replace(" ", "_")
            if has_dna_alphabet:  # Biopython <1.78
                record.seq.alphabet = DNAAlphabet()
            record.annotations["molecule_type"] = "DNA"
            # Sorry for this parts, it took a lot of "whatever works".
            # keep your part names under 20c and pointless, and everything
            # will be good
            if str(record.id).strip() in UNKNOWN_IDS:
                record.id = name
            # Fix: copy-paste bug — the second check tested record.id twice
            # and never repaired record.name (compare the sibling
            # records_from_data_files implementations).
            if str(record.name).strip() in UNKNOWN_IDS:
                record.name = name
            record.file_name = name_no_extension
            if use_file_names_as_ids and single_record:
                # ``source_file`` is presumably set by load_records_from_file.
                basename = os.path.basename(record.source_file)
                basename_no_extension = os.path.splitext(basename)[0]
                record.id = basename_no_extension
        records += recs
    return records
#!/usr/bin/python import logging import sys from Bio.Seq import MutableSeq from Bio.Seq import Seq from Bio.Alphabet import DNAAlphabet from optparse import OptionParser import sequence_utils ALPHABET = DNAAlphabet() DEFAULT_SEQ_BASES = 20 START_HIS = Seq('ATGCATCATCACCATCACCAC', ALPHABET) INV_CAP_LINKER = Seq('GCTAGCGTTGATCGGGCACGTAAGAG', ALPHABET) def MakeOpts(): """Returns an OptionParser object with all the default options.""" opt_parser = OptionParser() opt_parser.add_option( "-i", "--input_filename", dest="input_filename", help="The filename of the sequence to make primers for.") opt_parser.add_option( "-o", "--overlap_length", type="int", dest="overlap_length", default=20,
def translate_chain_force_in_frame(chain):
    """Force *chain* in frame, then translate it; gap characters become 'N'."""
    in_frame_nt = sequence_force_in_frame(chain, replace=False)
    # Replace alignment gaps so translation yields 'X' instead of failing.
    gapless_nt = in_frame_nt.replace('-', 'N')
    protein = Seq(gapless_nt, DNAAlphabet()).translate()
    return protein.tostring()
# with open('d:\\x.txt', 'w') as fp: # fp.write(inp) project = json.loads(inp) blocks = project['parts'] seq = '' features = [] start = 0 for block in blocks: seq += block['sequence'] end = start + len(block['sequence']) features.append( SeqFeature(FeatureLocation(start, end, strand=block['strand']), type=block['featureType'], id=block['name'])) start = end sequence = Seq(seq, DNAAlphabet()) # features = [SeqFeature(FeatureLocation(1, 3, strand=1), type="CDS"), SeqFeature(FeatureLocation(5, 7, strand=-1), type="intron", id="someid",qualifiers={"quqqli":"bar"})] seqRecord = SeqRecord(sequence, features=features) # print(seqRecord) string_io = io.StringIO() SeqIO.write(seqRecord, string_io, 'genbank') print(json.dumps({"content": string_io.getvalue()})) # # with open('temp.gb', 'w') as fp: # fileName = '%s.gb'%randomString() # SeqIO.write(seqRecord, '../public/genbank/%s'%fileName, 'genbank') # print(json.dumps({"fileURL": 'genbank/%s'%fileName})) # print(json.dumps({"gb":fp.getvalue()}))
def main(argv):
    """Run Huelsenbeck-diagram simulations and tree reconstructions.

    Simulates coalescent gene trees on a grid of branch-length pairs,
    generates sequences with INDELible, accumulates k-mer distance
    matrices, and evaluates several reconstruction methods, persisting
    everything through SQLAlchemy.
    """
    # Default parameters (overridable from the command line below).
    indelible_model = 'JC'
    indelible_model = 'LAV0.01a'  # NOTE(review): overrides the line above
    theta = 0.01
    mu = 1
    #k = (1,2)
    k = (1,2,3,4,5)
    m = 100
    n = 5
    nr_genes = 10
    nr_sims = 1
    nr_rows = 3
    nr_cols = 3
    a_max = 0.74
    b_max = 0.74
    #a_max = 0.3
    #b_max = 0.2
    kmer_methods = ['CoalescentJCNJ', 'CoalescentJCLS', 'JCNJ','dstarNJ','concatdJCNJ']
    #kmer_methods = ['dstarNJ','concatdJCNJ']
    distance_formulas = ['ARS2015', 'alignment_based']
    multiple_alignment_method = 'clustalo'
    alignment_method = 'stretcher'
    N = theta/mu
    db_file = 'db.sql'
    try:
        opts, args = getopt.getopt(argv,"hk:m:n:o:",["indelible_model=","theta=","genes=","sims=","rows=","cols=","a_max=","b_max="])
    except getopt.GetoptError as err:
        # print usage information and exit:
        print(str(err))
        usage()
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            usage()
            sys.exit()
        elif opt == "--theta":
            theta = float(arg)
            N = theta
        elif opt == "-k":
            # Accept "(1,2,3)" or "1,2,3".
            k = tuple([int(k_i) for k_i in re.sub("[()]","",arg).split(",")])
        elif opt == "-m":
            m = int(arg)
        elif opt == "-o":
            db_file = arg
        elif opt == "-n":
            n = int(arg)
        elif opt == "--genes":
            nr_genes = int(arg)
        elif opt == "--method":
            method = arg
        elif opt == "--distance_formula":
            distance_formula = arg
        elif opt == "--sims":
            nr_sims = int(arg)
        elif opt == "--rows":
            nr_rows = int(arg)
        elif opt == "--cols":
            nr_cols = int(arg)
        elif opt == "--a_max":
            a_max = float(arg)
        elif opt == "--b_max":
            b_max = float(arg)
        elif opt == "--indelible_model":
            indelible_model = arg
        elif opt == "--reconstruct_only":
            # NOTE(review): "--reconstruct_only" is not declared in the
            # getopt long-options list above, so this branch is unreachable.
            reconstruct_only = True
            usage()
    import logging
    # Route (very verbose) SQLAlchemy engine logs to a file.
    sqla_logger = logging.getLogger('sqlalchemy.engine.base.Engine')
    sqla_logger.propagate = False
    sqla_logger.addHandler(logging.FileHandler('/tmp/sqla.log'))
    from sqlalchemy import create_engine
    engine = create_engine('sqlite:///{:s}'.format(os.path.abspath(db_file)), echo=True, convert_unicode=True)
    from sqlalchemy.orm import sessionmaker
    Session = sessionmaker(bind=engine)
    session = Session()
    Base.metadata.create_all(engine)
    import resource
    sim_set = None
    print('Simulating sequence data for a {:d}x{:d} Huelsenbeck diagram with {:d} simulations for each tree, and {:d} gene trees per simulation, using the following parameters: theta={:.2e}, m={:d}, indelible_model={:s}\n'.format(nr_rows,nr_cols,nr_sims,nr_genes,theta,m,indelible_model))
    sim_set = HuelsenbeckSimulationSet(rows=nr_rows, cols=nr_cols, nr_sims=nr_sims, theta=theta, indelible_model=indelible_model, genes=nr_genes, m=m, n=n, a_max=a_max, b_max=b_max)
    for method in ['raxml']:
        tree_estimate_set = HuelsenbeckTreeEstimateSet(simulation_set=sim_set, method=method, distance_formula=None, alignment_method=multiple_alignment_method, k=None)
        session.add(sim_set)
        session.add(tree_estimate_set)
        # Grid of Huelsenbeck branch-length pairs (a, b).
        for row,b in enumerate([(row+1)*(b_max)/nr_rows for row in range(nr_rows)]):
            for col,a in enumerate([(col+1)*(a_max)/nr_cols for col in range(nr_cols)]):
                # Jukes-Cantor distance -> branch length conversion.
                t_a = abs(-3.0/4.0*log(1-4.0/3.0*a)/mu)
                t_b = abs(-3.0/4.0*log(1-4.0/3.0*b)/mu)
                tree = huelsenbeck_tree(t_a,t_b,5)
                tree_newick = ")".join(tree.format('newick').split(")")[:-1])+")"
                print(tree_newick)
                xtree = XTree(tree,dict((clade,set([clade.name])) for clade in tree.get_terminals()))
                #print(','.join([''.join(split[0])+'|'+''.join(split[1]) for split in xtree.get_splits()])+" (t_a,t_b) = ({:f},{:f}): ".format(t_a,t_b))
                species_set = sorted(tree.get_terminals(),key=lambda species: species.name)
                n = len(species_set)  # NOTE(review): overwrites the CLI -n value
                species_names = [species.name for species in species_set]
                genes = [GeneLineage(name='s{:d}'.format(i)) for i,_ in enumerate(range(len(species_set)))]
                gene_embedding = dict(zip(species_set,[[gene] for gene in genes]))
                for sim_it in range(nr_sims):
                    # Create simulation objects
                    sim = Simulation(tree=tree_newick, theta=theta, indelible_model=indelible_model, genes=nr_genes, m=m, n=n)
                    session.add(sim)
                    huel_sim = HuelsenbeckSimulation(simulation_set=sim_set, simulation=sim, row=row, col=col)
                    session.add(huel_sim)
                    # Prepare kmer distance matrices to be used to compute averages
                    kmer_distance_matrices = dict()
                    finite_counts_matrices = dict()
                    for distance_formula in ['dstar', 'ARS2015']:
                        kmer_distance_matrices[distance_formula] = dict()
                        finite_counts_matrices[distance_formula] = dict()
                        for k_i in k:
                            kmer_distance_matrices[distance_formula][k_i] = zero_distance_matrix(species_names)
                            finite_counts_matrices[distance_formula][k_i] = zero_distance_matrix(species_names)
                    # Prepare concatenated sequence object
                    sample_ids = [sample.name for sample in itertools.chain.from_iterable(gene_embedding.values())]
                    concatenated_sequences = dict((sample_id,SeqRecord(Seq('',DNAAlphabet()),id=sample_id,name=sample_id,description=sample_id)) for sample_id in sample_ids)
                    for gene in range(nr_genes):
                        # generate gene tree and sequences
                        # for each set of sequence, and for each distance formula and value of k, generate a k-mer distance matrix. sum these matrices for all genes
                        # also store the concatenated sequences
                        coalescent = EmbeddedGeneForest(tree, gene_embedding)
                        coalescent.coalesce(theta)
                        genetree = coalescent.genetree()
                        with TemporaryDirectory() as tmpdir:
                            sequences = mutate_indelible(genetree, m, tmpdir, indelible_model, aligned=False)
                            aligned_sequences = SeqIO.to_dict(align_sequences(sequences, multiple_alignment_method, tmpdir))
                        for sample_id in sample_ids:
                            concatenated_sequences[sample_id] += aligned_sequences[sample_id]
                        for distance_formula in ['dstar', 'ARS2015']:
                            for k_i in k:
                                if distance_formula == 'ARS2015':
                                    dm = kmer_distance_matrix(sequences, k_i, normalized_kmer_distance, grouping=gene_embedding)
                                    #print(dm)
                                elif distance_formula == 'dstar':
                                    dm = kmer_distance_matrix(sequences, k_i, dstar_kmer_distance, grouping=gene_embedding)
                                else:
                                    raise Exception
                                # Track finite entries so the average below only
                                # divides by the number of usable gene matrices.
                                finite_counts_matrices[distance_formula][k_i] += dm.isfinite()
                                kmer_distance_matrices[distance_formula][k_i] += dm.nantozero()
                                #print(kmer_distance_matrices[distance_formula][k_i])
                    for distance_formula in ['dstar', 'ARS2015']:
                        for k_i in k:
                            #print(finite_counts_matrices[distance_formula][k_i])
                            avg_dm = kmer_distance_matrices[distance_formula][k_i]/finite_counts_matrices[distance_formula][k_i]
                            #if distance_formula == 'dstar':
                            #    print(finite_counts_matrices[distance_formula][k_i])
                            #    print(avg_dm)
                            kdm = kmer_distance_matrix_from_dm(avg_dm, sim, distance_formula, None, k_i)
                            session.add(kdm)
                    jc_dm = kmer_distance_matrix(concatenated_sequences.values(), 1, aligned_kmer_distance, alignment_fn=stretcher_alignment, grouping=gene_embedding)
                    #print(jc_dm)
                    kdm = kmer_distance_matrix_from_dm(jc_dm, sim, 'concatdJC', alignment_method, 1)
                    session.add(kdm)
                    # reconstruct from concatenated sequences using raxml
                    with TemporaryDirectory() as tmpdir:
                        t0 = time.clock()
                        xtreehat = RAxML(concatenated_sequences.values(), gene_embedding, tmpdir)
                        t1 = time.clock()
                    success = int(xtree.displays(xtreehat))
                    print(success)
                    tree_estimate = TreeEstimate(simulation=sim, method=tree_estimate_set.method, distance_formula=tree_estimate_set.distance_formula, k=tree_estimate_set.k, splits=','.join([''.join(split[0])+'|'+''.join(split[1]) for split in xtreehat.get_splits()]), success=int(xtree.displays(xtreehat)), dt=t1-t0)
                    session.add(tree_estimate)
                    #session.commit()
                    huel_tree_estimate = HuelsenbeckTreeEstimate(tree_estimate_set=tree_estimate_set, tree_estimate=tree_estimate, huelsenbeck_simulation=huel_sim)
                    session.add(huel_tree_estimate)
    session.commit()
    # create tree_estimate sets
    for method in kmer_methods:
        if method in ['CoalescentJCNJ', 'CoalescentJCLS', 'JCNJ']:
            distance_formula = 'ARS2015'
            tree_estimate_set = HuelsenbeckTreeEstimateSet(simulation_set=sim_set, method=method, distance_formula=distance_formula, alignment_method=None, k=",".join([str(k_i) for k_i in k]))
        elif method == 'dstarNJ':
            distance_formula = 'dstar'
            tree_estimate_set = HuelsenbeckTreeEstimateSet(simulation_set=sim_set, method=method, distance_formula=distance_formula, alignment_method=None, k=",".join([str(k_i) for k_i in k]))
        elif method == 'concatdJCNJ':
            distance_formula = 'concatdJC'
            #alignment_method = 'clustalo'
            tree_estimate_set = HuelsenbeckTreeEstimateSet(simulation_set=sim_set, method=method, distance_formula=distance_formula, alignment_method=alignment_method, k="1")
        session.add(tree_estimate_set)
    session.commit()
    # fetch tree_estimate sets that do not require full sequence data
    tree_estimate_sets = session.query(HuelsenbeckTreeEstimateSet).\
        join(HuelsenbeckTreeEstimateSet.simulation_set).\
        filter(HuelsenbeckTreeEstimateSet.method.in_(kmer_methods)). \
        filter(HuelsenbeckTreeEstimateSet.simulation_set==sim_set).all()
    # run tree_estimates
    for tree_estimate_set in tree_estimate_sets:
        method = tree_estimate_set.method
        print(method)
        distance_formula = tree_estimate_set.distance_formula
        #alignment_method = tree_estimate_set.alignment_method
        try:
            k = [int(k_i) for k_i in tree_estimate_set.k.split(",")]
        except AttributeError:
            # k column is NULL for this estimate set.
            k = None
        for huel_sim in tree_estimate_set.simulation_set.huelsenbeck_simulations:
            sim = huel_sim.simulation
            treedata = sim.tree
            handle = StringIO(treedata)
            #print(handle.read())
            tree = Phylo.read(handle, "newick")
            xtree = XTree(tree,dict((clade,set([clade.name])) for clade in tree.get_terminals()))
            kmer_distance_matrices = dict((kdm.k,kdm.to_dm()) for kdm in sim.kmer_distance_matrices if kdm.k in k and kdm.distance_formula==distance_formula)
            t0 = time.clock()
            if method == 'CoalescentJCLS':
                xtreehat = TreeMinDistanceFromFiveTaxonCoalescentJCExpectedKmerDistanceParameterizationMap(kmer_distance_matrices)
            elif method == 'CoalescentJCNJ':
                xtreehat = NJArgMinSumOfDistancesFromCoalescentJCExpectedKmerPairDistanceParameterizationMap(kmer_distance_matrices)
            elif method == 'JCNJ':
                #for k,dm in kmer_distance_matrices.items():
                #    print dm
                adjusted_distance_matrices = dict((k,JCKmerDistanceMatrixAdjustment(kmer_distance_matrix,k)) for k,kmer_distance_matrix in kmer_distance_matrices.items())
                #for k,dm in adjusted_distance_matrices.items():
                #    print dm
                xtreehat = NJ(adjusted_distance_matrices)
            elif method == 'dstarNJ':
                #for _,dm in kmer_distance_matrices.items():
                #    print dm
                xtreehat = NJ(kmer_distance_matrices)
            elif method == 'concatdJCNJ':
                adjusted_distance_matrices = {1:JCKmerDistanceMatrixAdjustment(kmer_distance_matrices[1],1)}
                xtreehat = NJ(adjusted_distance_matrices)
            else:
                raise(Exception)
            t1 = time.clock()
            success = int(xtree.displays(xtreehat))
            # NOTE(review): t_a/t_b here are stale values left over from the
            # simulation loops above, not per-estimate parameters.
            print(','.join([''.join(split[0])+'|'+''.join(split[1]) for split in xtree.get_splits()])+" (t_a,t_b) = ({:f},{:f}): ".format(t_a,t_b)+','.join([''.join(split[0])+'|'+''.join(split[1]) for split in xtreehat.get_splits()])+" ({:d})".format(success))
            #print(k)
            tree_estimate = TreeEstimate(simulation=sim, method=method, distance_formula=distance_formula, k=",".join([str(k_i) for k_i in k]), splits=','.join([''.join(split[0])+'|'+''.join(split[1]) for split in xtreehat.get_splits()]), success=int(xtree.displays(xtreehat)), dt=t1-t0)
            session.add(tree_estimate)
            #session.commit()
            huel_tree_estimate = HuelsenbeckTreeEstimate(tree_estimate_set=tree_estimate_set, tree_estimate=tree_estimate, huelsenbeck_simulation=huel_sim)
            session.add(huel_tree_estimate)
    session.commit()
yield ''.join(myseq) def MultiExpandSeq(seq, positions): myseq = list(seq.tostring()) for bases in itertools.combinations('ATCG', len(positions)): for i, pos in enumerate(positions): myseq[pos] = bases[i] yield ''.join(myseq) seq = Seq(''.join(['TCAGCAGGACGCACTGACC', 'GAATTCTACTAGT', #'TAATAGAAATAATTTTGTTTAACTTTA', #'CAACAGAAACAACCCCGCCCAATCCCA', 'ACACACACACACACACACACACACACA', 'AGGGGATTAATTATGCATCATCACCATCACCACG']), DNAAlphabet()) seq_str = seq.tostring() print seq_str for i, c in enumerate(seq_str): print '%d, %s' % (i, c) #rbs_start = seq_str.find('AGGAGG') rbs_start = seq_str.find('AGGGGATTAA') #spacer_loc = seq_str.find('ACACACACATGCAT') spacer_loc = seq_str.find('TAATTATGCATCAT') test_positions = range(rbs_start, rbs_start + len(seq[rbs_start:spacer_loc])) print 'Testing positions', test_positions
class TranspositionTest(unittest.TestCase):
    """Tests for simulated transposon insertion, shearing and serialization.

    TARGET is the full-length target DNA; INSERT_SEQ is the transposon
    payload; FIXED_5P/FIXED_3P are the fixed flanks around the insert;
    ORF_START is the 0-based offset of the ORF in TARGET; LINKER_GEN
    generates ambiguous linker sequences.
    """
    TARGET = Seq('GATCTAAAGAGGAGAAAGGATCTATGGATAAGAAATACTCAATAGGCTTAGCTATCGGCACAAATAGCGTCGGATGGGCGGTGATCACTGATGAATATAAGGTTCCGTCTAAAAAGTTCAAGGTTCTGGGAAATACAGACCGCCACAGTATCAAAAAAAATCTTATAGGGGCTCTTTTATTTGACAGTGGAGAGACAGCGGAAGCGACTCGTCTCAAACGGACAGCTCGTAGAAGGTATACACGTCGGAAGAATCGTATTTGTTATCTACAGGAGATTTTTTCAAATGAGATGGCGAAAGTAGATGATAGTTTCTTTCATCGACTTGAAGAGTCTTTTTTGGTGGAAGAAGACAAGAAGCATGAACGTCATCCTATTTTTGGAAATATAGTAGATGAAGTTGCTTATCATGAGAAATATCCAACTATCTATCATCTGCGAAAAAAATTGGTAGATTCTACTGATAAAGCGGATTTGCGCTTAATCTATTTGGCCTTAGCGCATATGATTAAGTTTCGTGGTCATTTTTTGATTGAGGGAGATTTAAATCCTGATAATAGTGATGTGGACAAACTATTTATCCAGTTGGTACAAACCTACAATCAATTATTTGAAGAAAACCCTATTAACGCAAGTGGAGTAGATGCTAAAGCGATTCTTTCTGCACGATTGAGTAAATCAAGACGATTAGAAAATCTCATTGCTCAGCTCCCCGGTGAGAAGAAAAATGGCTTATTTGGGAATCTCATTGCTTTGTCATTGGGTTTGACCCCTAATTTTAAATCAAATTTTGATTTGGCAGAAGATGCTAAATTACAGCTTTCAAAAGATACTTACGATGATGATTTAGATAATTTATTGGCGCAAATTGGAGATCAATATGCTGATTTGTTTTTGGCAGCTAAGAATTTATCAGATGCTATTTTACTTTCAGATATCCTAAGAGTAAATACTGAAATAACTAAGGCTCCCCTATCAGCTTCAATGATTAAACGCTACGATGAACATCATCAAGACTTGACTCTTTTAAAAGCTTTAGTTCGACAACAACTTCCAGAAAAGTATAAAGAAATCTTTTTTGATCAATCAAAAAACGGATATGCAGGTTATATTGATGGGGGAGCTAGCCAAGAAGAATTTTATAAATTTATCAAACCAATTTTAGAAAAAATGGATGGTACTGAGGAATTATTGGTGAAACTAAATCGTGAAGATTTGCTGCGCAAGCAACGGACCTTTGACAACGGCTCTATTCCCCATCAAATTCACTTGGGTGAGCTGCATGCTATTTTGAGAAGACAAGAAGACTTTTATCCATTTTTAAAAGACAATCGTGAGAAGATTGAAAAAATCTTGACTTTTCGAATTCCTTATTATGTTGGTCCATTGGCGCGTGGCAATAGTCGTTTTGCATGGATGACTCGGAAGTCTGAAGAAACAATTACCCCATGGAATTTTGAAGAAGTTGTCGATAAAGGTGCTTCAGCTCAATCATTTATTGAACGCATGACAAACTTTGATAAAAATCTTCCAAATGAAAAAGTACTACCAAAACATAGTTTGCTTTATGAGTATTTTACGGTTTATAACGAATTGACAAAGGTCAAATATGTTACTGAAGGAATGCGAAAACCAGCATTTCTTTCAGGTGAACAGAAGAAAGCCATTGTTGATTTACTCTTCAAAACAAATCGAAAAGTAACCGTTAAGCAATTAAAAGAAGATTATTTCAAAAAAATAGAATGTTTTGATAGTGTTGAAATTTCAGGAGTTGAAGATAGATTTAATGCTTCATTAGGTACCTACCATGATTTGCTAAAAATTATTAAAGATAAAGATTTTTTGGATAATGAAGAAAATGAAGATATCTTAGAGGATATTGTTTTAACATTGACCTTATTTGAAGATAGGGAGATGATTGAGGAAAGACTTAAAACATATGCTCACCTCTTTGATGATAAGGTGATGAAACAGCTTAAACGTCGCCGTTATACTGGTTGGGGACGTTTGTCTCGAAAATTGATTAATGGTATTAGGGATAAGCAATCTGGCAAAACAATATTAGATTTTTTGAAATCAGATGGTTTTGCCAATCGCAATTTTATGCAGCTGATCCATGATGATAGTTTGACATTTAAAGAAGACATTCAAAAAGCACAAGTGTCTGGACAAGGCGATAGTTTACATGAACATATTGCAAATTTAGCTGGTAGCCCTGCTATTAAAAAAGGTATTTTACAGACTGTAAAAGTTGTTGATGAATTGGTCAAAGTAATGGGGCGGCATAAGCCAGAAAATATCGTTATTGAAATGGCACGTGAAAATCAGACAACTCAAAAGGGCCAGAAAAATTCGCGAGAGCGTATGAAACGAATCGAAGAAGGTATCAAAGAATTAGGAAGTCAGATTCTTAAAGAGCATCCTGTTGAAAATACTCAATTGCAAAATGAAAAGCTCTATCTCTATTATCTCCAAAATGGAAGAGACATGTATGTGGACCAAGAATTAGATATTAATCGTTTAAGTGATTATGATGTCGATGCCATTGTTCCACAAAGTTTCCTTAAAGACGATTCAATAGACAATAAGGTCTTAACGCGTTCTGATAAAAATCGTGGTAAATCGGATAACGTTCCAAGTGAAGAAGTAGTCAAAAAGATGAAAAACTATTGGAGACAACTTCTAAACGCCAAGTTAATCACTCAACGTAAGTTTGATAATTTAACGAAAGCTGAACGTGGAGGTTTGAGTGAACTTGATAAAGCTGGTTTTATCAAACGCCAATTGGTTGAAACTCGCCAAATCACTAAGCATGTGGCACAAATTTTGGATAGTCGCATGAATACTAAATACGATGAAAATGATAAACTTATTCGAGAGGTTAAAGTGATTACCTTAAAATCTAAATTAGTTTCTGACTTCCGAAAAGATTTCCAATTCTATAAAGTACGTGAGATTAACAATTACCATCATGCCCATGATGCGTATCTAAATGCCGTCGTTGGAACTGCTTTGATTAAGAAATATCCAAAACTTGAATCGGAGTTTGTCTATGGTGATTATAAAGTTTATGATGTTCGTAAAATGATTGCTAAGTCTGAGCAAGAAATAGGCAAAGCAACCGCAAAATATTTCTTTTACTCTAATATCATGAACTTCTTCAAAACAGAAATTACACTTGCAAATGGAGAGATTCGCAAACGCCCTCTAATCGAAACTAATGGGGAAACTGGAGAAATTGTCTGGGATAAAGGGCGAGATTTTGCCACAGTGCGCAAAGTATTGTCCATGCCCCAAGTCAATATTGTCAAGAAAACAGAAGTACAGACAGGCGGATTCTCCAAGGAGTCAATTTTACCAAAAAGAAATTCGGACAAGCTTATTGCTCGTAAAAAAGACTGGGATCCAAAAAAATATGGTGGTTTTGATAGTCCAACGGTAGCTTATTCAGTCCTAGTGGTTGCTAAGGTGGAAAAAGGGAAATCGAAGAAGTTAAAATCCGTTAAAGAGTTACTAGGGATCACAATTATGGAAAGAAGTTCCTTTGAAAAAAATCCGATTGACTTTTTAGAAGCTAAAGGATATAAGGAAGTTAAAAAAGACTTAATCATTAAACTACCTAAATATAGTCTTTTTGAGTTAGAAAACGGTCGTAAACGGATGCTGGCTAGTGCCGGAGAATTACAAAAAGGAAATGAGCTGGCTCTGCCAAGCAAATATGTGAATTTTTTATATTTAGCTAGTCATTATGAAAAGTTGAAGGGTAGTCCAGAAGATAACGAACAAAAACAATTGTTTGTGGAGCAGCATAAGCATTATTTAGATGAGATTATTGAGCAAATCAGTGAATTTTCTAAGCGTGTTATTTTAGCAGATGCCAATTTAGATAAAGTTCTTAGTGCATATAACAAACATAGAGACAAACCAATACGTGAACAAGCAGAAAATATTATTCATTTATTTACGTTGACGAATCTTGGAGCTCCCGCTGCTTTTAAATATTTTGATACAACAATTGATCGTAAACGATATACGTCTACAAAAGAAGTTTTAGATGCCACTCTTATCCATCAATCCATCACTGGTCTTTATGAAACACGCATTGATTTGAGTCAGCTAGGAGGTGACTAACTCGA', DNAAlphabet())
    INSERT_SEQ = Seq('CAACGTCGGCGTGTGACGGTGCGCAGGTCGTGCTCGAAGTTAAGTACATG', DNAAlphabet())
    FIXED_5P = Seq('TGCATC')
    FIXED_3P = Seq('GCGTCA')
    ORF_START = 24
    LINKER_GEN = AmbiguousSequence('BCT')

    def testNoLinker(self):
        # Shear many fragments from many random transpositions; every
        # resulting SeqRecord must carry the fragment's id string.
        insert_gen = InsertGenerator(self.INSERT_SEQ, self.FIXED_5P,
                                     self.FIXED_3P, extra_bp_5p='T')
        for tn_id in xrange(1000):
            tn_gen = Transposition(tn_id, insert_gen, self.TARGET,
                                   self.ORF_START)
            for frag_id in xrange(1000):
                read = tn_gen.Shear(frag_id)
                record = read.ToSeqRecord()
                self.assertIsNotNone(record.description)
                self.assertEquals(record.name, read.id_str)
                self.assertEquals(record.id, read.id_str)

    def testFrame(self):
        # TODO: test this more
        insert_gen = InsertGenerator(self.INSERT_SEQ, self.FIXED_5P,
                                     self.FIXED_3P, extra_bp_5p='T')
        # Insertion site chosen to keep the insert in frame with the ORF.
        ins = 9 + self.ORF_START - 1
        tn_gen = Transposition(1023, insert_gen, self.TARGET,
                               self.ORF_START, insertion_site=ins)
        self.assertTrue(tn_gen.in_frame)
        self.assertEquals(14, tn_gen.expected_insertion_site)

    def testLinker(self):
        # Same as testNoLinker, but with an ambiguous linker generator.
        insert_gen = InsertGenerator(self.INSERT_SEQ, self.FIXED_5P,
                                     self.FIXED_3P, extra_bp_5p='T',
                                     linker_gen=self.LINKER_GEN)
        for tn_id in xrange(1000):
            tn_gen = Transposition(tn_id, insert_gen, self.TARGET,
                                   self.ORF_START)
            for frag_id in xrange(1000):
                read = tn_gen.Shear(frag_id)
                record = read.ToSeqRecord()
                self.assertIsNotNone(record.description)
                self.assertEquals(record.name, read.id_str)
                self.assertEquals(record.id, read.id_str)

    def testSerialize(self):
        # Round-trip: write sheared records as fasta, parse them back, and
        # check the per-record info dict survives serialization.
        insert_gen = InsertGenerator(self.INSERT_SEQ, self.FIXED_5P,
                                     self.FIXED_3P, extra_bp_5p='T',
                                     linker_gen=self.LINKER_GEN)
        records = []
        for tn_id in xrange(10):
            tn_gen = Transposition(tn_id, insert_gen, self.TARGET,
                                   self.ORF_START)
            records.extend([tn_gen.Shear(i).ToSeqRecord() for i in xrange(100)])
        outfile = StringIO()
        writer = FastaIO.FastaWriter(outfile)
        writer.write_header()
        writer.write_records(records)
        # Parse the generated output.
        infile = StringIO(outfile.getvalue())
        parsed = SeqIO.parse(infile, 'fasta')
        expected_info_keys = Fragment.INFO_DICT_KEYS
        for record in parsed:
            info_dict = Fragment.ParseInfoDict(record)
            self.assertListEqual(sorted(info_dict.keys()),
                                 sorted(expected_info_keys))
def main():
    """Design exon-junction-spanning qPCR primers for every mRNA in gene_list.

    Relies on module-level configuration globals (genome, gff3, gene_list,
    temp_file_loc, transcripts_seq, primer3 tuning parameters, ...).  For each
    mRNA it: splices the exon sequences into the mature transcript, writes a
    Primer3 input file targeting the exon-exon junctions, runs primer3_core,
    BLASTs the returned primers against the genome and the transcriptome, and
    writes a per-mRNA TSV summary plus a plot via draw_primers().

    Exits with status 1 if blastn or primer3_core is not on $PATH.
    """
    # os.system returns the shell's 16-bit wait status; 'blastn -version'
    # exits 0 when present, and 'primer3_core -help' exits 255 -> 255<<8 == 65280.
    chk_blastn = not (os.system('blastn -version'))
    chk_primer3 = int(os.system('primer3_core -help'))  # 65280
    chk_genome_index = os.path.isfile(genome + '.idx')
    chk_gff_index = os.path.isfile(gff3 + '.db.idx')
    if chk_primer3 != 65280:
        print >> sys.stderr, "primer3_core not found on $PATH!!"
        sys.exit(1)
    if not chk_blastn:
        print >> sys.stderr, "blastn not found on $PATH!!"
        sys.exit(1)

    # Build (once) and open the random-access genome index.
    if not chk_genome_index:
        print >> sys.stderr, "Creating Genome index"
        idx = SeqIO.index_db(genome + '.idx', genome, "fasta")
        idx.close()
        print >> sys.stderr, " --DONE--"
    genome_idx = SeqIO.index_db(genome + '.idx')
    #print genome_idx['Chr01'].seq[1:10]

    # Build (once) and open the GFF3 feature database.
    if not chk_gff_index:
        print >> sys.stderr, "Creating GFF index.."
        gffutils.create_db(gff3, gff3 + '.db.idx')
        print >> sys.stderr, " --DONE--"
    gff_db = gffutils.FeatureDB(gff3 + '.db.idx')

    if not os.path.isdir(temp_file_loc):
        os.makedirs(temp_file_loc)

    # NOTE(review): commands are built by string interpolation and passed to
    # os.system (shell=True semantics); file names containing shell
    # metacharacters would be unsafe.  Kept as-is to preserve behavior.
    if not os.path.isfile(genome + '.nin'):
        cmd = "makeblastdb -in %s -dbtype nucl" % (genome)
        print >> sys.stderr, "Creating genome blast db..\n" + cmd
        os.system(cmd)
    if not os.path.isfile(transcripts_seq + '.nin'):
        cmd = "makeblastdb -in %s -dbtype nucl" % (transcripts_seq)
        print >> sys.stderr, "Creating transcript seq blast db..\n" + cmd
        os.system(cmd)

    print_params(opt_primer_len, surround_exon_junc, min_five_overlap,
                 min_three_overlap, min_product_size, max_product_size,
                 min_GC_perc, min_Tm, min_distance_3_primer,
                 Blastn_extra_params)

    target_mRNAs = open(gene_list, 'r')
    for l in target_mRNAs:
        l = l.rstrip()
        mRNA = gff_db[l]
        print >> sys.stderr, "Processing: %s %s %d bp" % (
            mRNA.id, mRNA.strand, (mRNA.end - mRNA.start) + 1)
        mRNA_seq = ''
        primer_3_seq = ''  # Just keeping a tack.
        exon_array = list()
        exon_junctions_list = list()  # list of exon-exon junctions
        exons = sorted([f.id for f in gff_db.children(l, featuretype='exon')])
        if exons[0] != l + '.exon.1':
            print >> sys.stderr, "Exon ID format mismatched!!. Expecting %s. Found %s" % (
                l + '.exon.1', exons[0])
        #else:
        #    print >> sys.stderr, 'Correct exon format!!'

        # Splice the exons (in numeric order) into the mature mRNA sequence,
        # recording the junction offsets within the spliced sequence.
        for e in xrange(1, len(exons) + 1):  # For each exon segment
            ex = gff_db[l + '.exon.' + str(e)]
            exon_array.append(ex)  ## keepin order constant
            #print ex.id
            s = genome_idx[ex.seqid].seq[
                ex.start - 1:ex.end]  # as one 1-indexed gff3, python uses 0-index base
            if mRNA.strand == '-':
                s = s.reverse_complement()
            mRNA_seq += str(s)
            primer_3_seq += str(s)
            if e != len(exons):
                primer_3_seq += '-'
                exon_junctions_list.append(len(mRNA_seq))

        ## Store mRNA sequence as Bio.Seq() object
        mRNA_seq = Seq(mRNA_seq, DNAAlphabet())

        ## Generate Primer3_input file
        output = open(
            os.path.join(temp_file_loc, mRNA.id + '.primer3_input.txt'), 'w')
        input_data = [
            'SEQUENCE_ID=' + mRNA.id,
            'SEQUENCE_TEMPLATE=' + str(mRNA_seq),
            'PRIMER_TASK=pick_pcr_primers',
            'PRIMER_MIN_3_PRIME_OVERLAP_OF_JUNCTION=' + str(min_three_overlap),
            'PRIMER_MIN_5_PRIME_OVERLAP_OF_JUNCTION=' + str(min_five_overlap),
            'PRIMER_MIN_THREE_PRIME_DISTANCE=' + str(min_distance_3_primer),
            'PRIMER_OPT_SIZE=' + str(opt_primer_len),
            'PRIMER_MIN_SIZE=' + str(opt_primer_len - 2),
            'PRIMER_MAX_SIZE=' + str(opt_primer_len + 2),
            'PRIMER_MIN_GC=' + str(min_GC_perc),
            'PRIMER_MAX_NS_ACCEPTED=1',
            # BUGFIX: the range previously used min_product_size for BOTH
            # bounds ("min-min"), so max_product_size was silently ignored.
            'PRIMER_PRODUCT_SIZE_RANGE=' + str(min_product_size) + '-' +
            str(max_product_size),
            'P3_FILE_FLAG=0',
            'PRIMER_EXPLAIN_FLAG=1',
            'PRIMER_THERMODYNAMIC_PARAMETERS_PATH=/media/winterfell/kanhu/SOFTWARES/primer3-2.3.7/src/primer3_config/'
        ]
        if surround_exon_junc:
            # Target a 50 bp window ending at each junction.
            input_data.append(
                'SEQUENCE_TARGET=' +
                " ".join([str(j - 50) + ',50' for j in exon_junctions_list]))
        else:
            # Require primers to physically overlap a junction.
            input_data.append('SEQUENCE_OVERLAP_JUNCTION_LIST=' +
                              " ".join([str(j) for j in exon_junctions_list]))
        input_data.append('=')  # Primer3 record terminator
        print >> output, "\n".join(input_data)
        output.close()

        ## RUN primer3_core
        print >> sys.stderr, "## RUNNING Primer3 ##"
        if generate_primer3_formatted_output:
            cmd = "primer3_core -format_output -output=%s < %s" % (
                os.path.join(temp_file_loc,
                             mRNA.id + '.primer3_Formatted_output.txt'),
                os.path.join(temp_file_loc, mRNA.id + '.primer3_input.txt'))
            print >> sys.stderr, "\t ", cmd
            os.system(cmd)
        cmd = "primer3_core -output=%s < %s" % (os.path.join(
            temp_file_loc, mRNA.id + '.primer3_detailed_output.txt'
        ), os.path.join(temp_file_loc, mRNA.id + '.primer3_input.txt'))
        print >> sys.stderr, "\t ", cmd
        os.system(cmd)

        ## Parse primer3 default output
        pri3_results = parse_primer3_detailed_output(
            os.path.join(temp_file_loc,
                         mRNA.id + '.primer3_detailed_output.txt'))
        if (not 'PRIMER_PAIR_NUM_RETURNED' in pri3_results) or (int(
                pri3_results['PRIMER_PAIR_NUM_RETURNED']) == 0):
            print >> sys.stderr, "\t No primer pairs found for mRNA: %s " % mRNA.id
            continue

        ### Generate fasta files of primers
        Fas_output = open(
            os.path.join(temp_file_loc, mRNA.id + '.primer3_output.fas'), 'w')
        for i in xrange(int(pri3_results['PRIMER_PAIR_NUM_RETURNED'])):
            print >> Fas_output, ">%s\n%s" % (
                'PRIMER_LEFT_' + str(i),
                pri3_results['PRIMER_LEFT_' + str(i) + '_SEQUENCE'])
            print >> Fas_output, ">%s\n%s" % (
                'PRIMER_RIGHT_' + str(i),
                pri3_results['PRIMER_RIGHT_' + str(i) + '_SEQUENCE'])
        Fas_output.close()

        ## Specificity screen: BLAST the candidate primers vs the genome.
        print >> sys.stderr, "## RUNNING Blastn Vs genome ##"
        cmd = "blastn -db %s -query %s -outfmt 6 -out %s %s" % (
            genome,
            os.path.join(temp_file_loc, mRNA.id + '.primer3_output.fas'),
            os.path.join(temp_file_loc,
                         mRNA.id + '.primers.g.blastn_output.tsv'),
            Blastn_extra_params)
        print >> sys.stderr, "\t ", cmd
        os.system(cmd)
        genome_blastn_out_dict = parse_blastn_tab_output(
            os.path.join(temp_file_loc,
                         mRNA.id + '.primers.g.blastn_output.tsv'))

        ## ...and vs the transcriptome.
        print >> sys.stderr, "## RUNNING Blastn Vs Transcripts ##"
        cmd = "blastn -db %s -query %s -outfmt 6 -out %s %s" % (
            transcripts_seq,
            os.path.join(temp_file_loc, mRNA.id + '.primer3_output.fas'),
            os.path.join(temp_file_loc,
                         mRNA.id + '.primers.t.blastn_output.tsv'),
            Blastn_extra_params)
        print >> sys.stderr, "\t ", cmd
        os.system(cmd)
        transcript_blastn_out_dict = parse_blastn_tab_output(
            os.path.join(temp_file_loc,
                         mRNA.id + '.primers.t.blastn_output.tsv'))

        ## Final output
        output = open(mRNA.id + '.primer3_output.tsv', 'w')
        print >> output, "mRNA\tPRIMER_SERIAL_NO\tLEFT_PRIMER\tLEFT_GC_PERCENT\tLEFT_TM\tLEFT_HAIRPIN_TH\tLEFT_END_STABILITY\tRIGHT_PRIMER\tRIGHT_GC_PERCENT\tRIGHT_TM\tRIGHT_HAIRPIN_TH\tRIGHT_END_STABILITY\tLEFT_Genome_BLASTN_HITS\tRIGHT_Genome_BLASTN_HITS\tLEFT_transcrpt_BLASTN_HITS\tRIGHT_transcript_BLASTN_HITS"
        for i in xrange(int(pri3_results['PRIMER_PAIR_NUM_RETURNED'])):
            # Primers with zero BLAST hits are absent from the parsed dicts;
            # default them to 0 so the row can always be emitted.
            if not 'PRIMER_LEFT_' + str(i) in genome_blastn_out_dict:
                genome_blastn_out_dict['PRIMER_LEFT_' + str(i)] = 0
            if not 'PRIMER_RIGHT_' + str(i) in genome_blastn_out_dict:
                genome_blastn_out_dict['PRIMER_RIGHT_' + str(i)] = 0
            if not 'PRIMER_LEFT_' + str(i) in transcript_blastn_out_dict:
                transcript_blastn_out_dict['PRIMER_LEFT_' + str(i)] = 0
            if not 'PRIMER_RIGHT_' + str(i) in transcript_blastn_out_dict:
                transcript_blastn_out_dict['PRIMER_RIGHT_' + str(i)] = 0
            out = [
                mRNA.id,
                str(i),
                pri3_results['PRIMER_LEFT_' + str(i) + '_SEQUENCE'],
                pri3_results['PRIMER_LEFT_' + str(i) + '_GC_PERCENT'],
                pri3_results['PRIMER_LEFT_' + str(i) + '_TM'],
                pri3_results['PRIMER_LEFT_' + str(i) + '_HAIRPIN_TH'],
                pri3_results['PRIMER_LEFT_' + str(i) + '_END_STABILITY'],
                pri3_results['PRIMER_RIGHT_' + str(i) + '_SEQUENCE'],
                pri3_results['PRIMER_RIGHT_' + str(i) + '_GC_PERCENT'],
                pri3_results['PRIMER_RIGHT_' + str(i) + '_TM'],
                pri3_results['PRIMER_RIGHT_' + str(i) + '_HAIRPIN_TH'],
                pri3_results['PRIMER_RIGHT_' + str(i) + '_END_STABILITY'],
                str(genome_blastn_out_dict['PRIMER_LEFT_' + str(i)]),
                str(genome_blastn_out_dict['PRIMER_RIGHT_' + str(i)]),
                str(transcript_blastn_out_dict['PRIMER_LEFT_' + str(i)]),
                str(transcript_blastn_out_dict['PRIMER_RIGHT_' + str(i)])
            ]
            print >> output, "\t".join(out)
        output.close()
        print >> sys.stderr, "## Ploting "
        draw_primers(exon_array, mRNA.id)
    target_mRNAs.close()
    genome_idx.close()
def change_biopython_record_sequence(record, new_seq):
    """Return a copy of ``record`` with its sequence replaced by ``new_seq``.

    Parameters
    ----------
    record
        A Biopython SeqRecord; it is deep-copied, never mutated.
    new_seq : str
        The new sequence string to install on the copy.

    Returns
    -------
    A new SeqRecord identical to ``record`` except for ``.seq``.
    """
    new_record = deepcopy(record)
    # BUGFIX: Seq() no longer accepts an alphabet on Biopython >= 1.78;
    # guard on has_dna_alphabet like the other helpers in this file do.
    if has_dna_alphabet:  # Biopython <1.78
        new_record.seq = Seq(new_seq, alphabet=DNAAlphabet())
    else:
        new_record.seq = Seq(new_seq)
    return new_record
def __init__(self, id, name, sequence, *args, **kwargs):
    """Initialize a gene that carries an explicit nucleotide sequence.

    Parameters:
        id: gene identifier, forwarded to Gene.__init__.
        name: gene name, forwarded to Gene.__init__.
        sequence: nucleotide sequence string, wrapped in a Bio.Seq.
        *args, **kwargs: passed through unchanged to Gene.__init__.
    """
    Gene.__init__(self, id, name, *args, **kwargs)
    # Uses the pre-1.78 Biopython alphabet API (Seq accepts DNAAlphabet()).
    self.sequence = Seq(sequence, DNAAlphabet())
    # Empty-string caches; presumably populated later by RNA/peptide
    # conversion helpers elsewhere in the class — TODO confirm.
    self._rna = ''
    self._peptide = ''