def import_dir(dir):
    assert os.path.isdir(dir)
    family_name = os.path.basename(os.path.abspath(dir))
    runtime().debug("Family Name", family_name)
    join = lambda *x: os.path.join(dir, *x)
    # Define the file names
    align_file = join("FAMILY.index")
    align_colcull_log = join("colcull.log")
    align_seqcull_log = join("seqcull.log")
    tree_file = join("oid.index.reroot")
    diagchars_file = join("diag.chars")
    # The codeml results file is optional
    codeml_file = join("%s_colcull.lrt" % family_name)
    print codeml_file, os.path.exists(codeml_file)
    codeml_file = codeml_file if os.path.exists(codeml_file) else None
    # Make sure all of the files exist
    for file in [tree_file, align_file, codeml_file, align_colcull_log, align_seqcull_log]:
        if file:
            assert os.path.exists(file), file
    runtime().debug(family_name, align_file, tree_file, codeml_file)
    oid = OIDImporter(familyName=family_name,
                      treeFile=tree_file,
                      treeDiagCharsFile=diagchars_file,
                      alignFile=align_file,
                      alignColcullLog=align_colcull_log,
                      alignSeqcullLog=align_seqcull_log,
                      codemlFile=codeml_file,
                      alignFormat="fasta",
                      oid_key=oid_key)
    oid.merge()
def pubmed(gi, ids, query):
    """
    Get the pubmed articles listed by *ids
    """
    for id in ids:
        handle = efetch(db="pubmed", id=id, retmode='xml', rettype='xml', retmax=MAX_RETURN)
        try:
            #print handle.read()
            results = eread(handle)
            for citation in results:
                #runtime().debug(citation.keys())
                citation = citation['MedlineCitation']
                pmid = citation['PMID']
                article = citation['Article']
                title = article['ArticleTitle']
                journal = article['Journal']['Title']
                try:
                    date = citation['DateCompleted'] if citation.has_key('DateCompleted') else citation['DateCreated']
                    year = date['Year']
                    month = date['Month']
                    day = date['Day']
                    datetime = "%s-%s-%s" % (year, month, day)
                except KeyError:
                    datetime = '0000-00-00'
                runtime().debug("Parsed pmid:%s" % id)
                yield Citation(gi, pmid, title, journal, datetime, query)
        except Exception:
            runtime().debug("Failure fetching pmid:%s" % id)
            continue
        finally:
            handle.close()
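# A minimal usage sketch for pubmed(), assuming the Entrez-backed efetch/eread
# helpers and the Citation class defined elsewhere in this module; the gi and
# pmid values, and the .pmid/.title attribute names, are hypothetical:
#
#   for citation in pubmed(123456, ["19304878", "20003500"], query="kinase"):
#       print citation.pmid, citation.title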
def run(self, twopass=True):
    mtx = self.options._mtx()
    output = self.options.output
    # strip() the trailing newline from `which`, or the path math below breaks
    psipred = subprocess.Popen("which psipred", stdout=subprocess.PIPE, shell=True).communicate()[0].strip()
    # Cut the /bin/psipred off
    bin_dir = os.path.split(psipred)[0]
    root_dir = os.path.split(bin_dir)[0]
    data = os.path.join(root_dir, "data")
    if self.options.single:
        cmd = """psipred %s %s/weights_s.dat %s/weights_s.dat2 %s/weights_s.dat3 > %s""" % (mtx, data, data, data, output)
    else:
        cmd = """psipred %s %s/weights.dat %s/weights.dat2 %s/weights.dat3 %s/weights.dat4 > %s""" % (mtx, data, data, data, data, output)
    #print cmd
    runtime().debug(cmd)
    subprocess.check_call(cmd, shell=True, cwd=self.options.cwd)
    output2 = self.options.output2
    horiz = self.options.horiz
    cmd = """psipass2 %s/weights_p2.dat 1 1.0 1.0 %s %s > %s""" % (data, output2, output, horiz)
    runtime().debug(cmd)
    subprocess.check_call(cmd, shell=True, cwd=self.options.cwd)
    with open(self.options.horiz) as handle:
        pred = parse(handle)
    return pred
def proteins(cursor, experiment=None, filter_experiments=True, sequence_key=None):
    """
    Return the selected proteins as SeqRecord objects
    """
    query = """SELECT s.id, s.sequence, e.id, e.short_name, e.taxonomy_id
        from hpf.experiment e
        join bddb.protein p on e.id=p.experiment_key
        join ddbCommon.sequence s on p.sequence_key=s.id """
    assert experiment != None or sequence_key != None
    if experiment != None or filter_experiments == True or sequence_key != None:
        query += " where "
    if experiment:
        if not hasattr(experiment, "__iter__"):
            experiment = [experiment]
        query += " e.id in (%s)" % (",".join([str(key) for key in experiment]))
    if filter_experiments:
        t = " e.taxonomy_id!=0"
        query += (" and " + t) if experiment else t
    if sequence_key:
        t = " s.id in (%s)" % (",".join([str(key) for key in sequence_key]))
        query += (" and " + t) if (experiment or filter_experiments) else t
    runtime().debug(query)
    cursor.execute(query)
    runtime().debug("Fetching")
    for id, sequence, e_id, e_name, taxonomy_id in cursor.fetchall():
        record = SeqRecord(Seq(sequence), str(id), description=e_name)
        record.annotations = {"taxonomy_id": taxonomy_id,
                              "experiment_key": e_id,
                              "organism": e_name}
        yield record
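# A usage sketch for proteins(), assuming a MySQLdb cursor with access to the
# hpf/bddb/ddbCommon schemas; the experiment id 804 is hypothetical:
#
#   cursor = MySQLdb.connect(host="127.0.0.1", db="hpf").cursor()
#   for record in proteins(cursor, experiment=804):
#       print record.id, record.annotations["organism"]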
def _mtx(self, name=None):
    """
    echo $tmproot.chk > $tmproot.pn
    echo $tmproot.fasta > $tmproot.sn
    $ncbidir/makemat -P $tmproot
    or
    $execdir/seq2mtx $1 > $tmproot.mtx
    """
    if not name:
        name = ".".join(os.path.basename(self.profile).split(".")[:-1])
    mtx = "%s.mtx" % name
    # Either generate the matrix from a profile or sequence alone
    if self.single:
        cmd = "seq2mtx %s > %s" % (self.fasta, mtx)
        #print cmd
        runtime().debug(cmd)
        subprocess.check_call(cmd, shell=True, cwd=self.cwd)
    else:
        for file, link in [(self.profile, name + ".pn"), (self.fasta, name + ".sn")]:
            subprocess.check_call("echo %s > %s" % (file, link), shell=True, cwd=self.cwd)
        makemat = subprocess.Popen("which makemat", shell=True, stdout=subprocess.PIPE).communicate()[0].strip()
        cmd = "%s -P %s" % (makemat, name)
        #print cmd
        runtime().debug(cmd)
        subprocess.check_call(cmd, shell=True, cwd=self.cwd)
    return mtx
def parse(self, handle):
    start = False
    for line in handle:
        if line.strip().startswith("Positively Selected Sites : Model 8"):
            start = True
            continue
        if not start:
            continue
        if line.startswith("Common"):
            break
        pieces = line.strip().split()
        if len(pieces) != 6 or not pieces[0].isdigit():
            continue
        runtime().debug("codeml", line)
        column, aa, pr, pm, _pm_, se = pieces
        # Indices start at 1 in CodeML, convert to 0 (pieces are strings)
        column = int(column) - 1
        assert column >= 0
        pr = pr.split("*")[0]
        original_column = self.mapper[column]
        from hpf.hddb.db import PositiveSelection
        ps = PositiveSelection(codeml_key=self.codeml.id,
                               column=original_column,
                               probability=pr,
                               post_mean=pm,
                               stderr=se)
        yield ps
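# For reference, parse() expects the six-column site lines of the CodeML
# "Positively Selected Sites" (Model 8) table, e.g. (values hypothetical):
#
#   12 T 0.987* 3.521 +- 1.234
#
# which unpacks as column, amino acid, probability (trailing '*' significance
# markers stripped), posterior mean, the '+-' separator, and standard error.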
def prep(prediction_code):
    runtime().debug("Running on", prediction_code)
    with PrepPredictionCode(prediction_code,
                            runtime().opt(SCRATCH),
                            runtime().opt(HPF1_DECOYS),
                            runtime().opt(HPF2_DECOYS)) as p:
        return p()
def main(file, fam):
    global family, session
    session = Session()
    # if isinstance(fam, basestring):
    #     f = Family()
    #     f.name = fam
    #     f.experiment_key = 0
    #     session.merge(f)
    #     session.flush()
    #     family = f.id
    # else:
    family = int(fam)
    #runtime().set_debug(1)
    runtime().debug("Using file", file)
    with open(file) as handle:
        from hpf.amnh.oid import phylip
        records = phylip(handle)._records
    runtime().debug("Found %i records" % len(records))
    from hpf.amnh.oid import index
    index(records)
    records = map(merge, records)
    session.commit()
    session.close()
def names(gi):
    columns = ["protgi", "gi"]
    tables = [("accession", 0),
              #("bind", 0),
              ("codedby", 0),
              ("genbank", 0),
              #("geneid", 0),
              ("genename", 0),
              #("kegg", 0),
              ("locustag", 0),
              ("pdb", 0),
              ("pfam", 0),
              ("unigene", 0),
              ("uniprotkb", 0)]
    query = "select * from %s where %s=%i"
    all = set()
    with MySQLdb.connect(db="synonyms3", host="err.bio.nyu.edu", user="******", passwd="patrick_nyu") as cursor:
        for name, col in tables:
            table = "refseq_%s" % name
            column = columns[col]
            q = query % (table, column, gi)
            runtime().debug(q)
            cursor.execute(q)
            rows = cursor.fetchall()
            for row in rows:
                if row[1] != None:
                    all.add(str(row[1]))
    return all
def run(self):
    mtx = self.options._mtx()
    output = self.options.output
    # strip() the trailing newline from `which`, or the path math below breaks
    psipred = subprocess.Popen("which psipred", stdout=subprocess.PIPE, shell=True).communicate()[0].strip()
    # Cut the /bin/psipred off
    bin_dir = os.path.split(psipred)[0]
    root_dir = os.path.split(bin_dir)[0]
    data = os.path.join(root_dir, "data")
    # Single run (from runpsipred_single script):
    #psipred $tmproot.mtx $datadir/weights_s.dat $datadir/weights_s.dat2 $datadir/weights_s.dat3 > $rootname.ss
    # Normal run (from runpsipred script):
    #psipred $tmproot.mtx $datadir/weights.dat $datadir/weights.dat2 $datadir/weights.dat3 > $rootname.ss
    cmd = "psipred {0} {1}/weights.dat {1}/weights.dat2 {1}/weights.dat3 > {2}".format(mtx, data, output)
    runtime().debug(cmd)
    subprocess.check_call(cmd, shell=True, cwd=self.options.cwd)
    output2 = self.options.output2
    horiz = self.options.horiz
    # psipass2 command (from runpsipred script):
    #psipass2 $datadir/weights_p2.dat 1 1.0 1.0 $rootname.ss2 $rootname.ss > $rootname.horiz
    cmd = "psipass2 {0}/weights_p2.dat 1 1.0 1.0 {1} {2} > {3}".format(data, output2, output, horiz)
    runtime().debug(cmd)
    subprocess.check_call(cmd, shell=True, cwd=self.options.cwd)
    # Note: Output format between Psipred versions 2.5 and 3.2 is the same (woo)
    with open(self.options.horiz) as handle:
        pred = parse(handle)
    return pred
def run(self):
    import subprocess, os
    with self.ctl as ctl_file:
        cmd = "codeml " + ctl_file
        runtime().debug(cmd)
        subprocess.check_call(cmd, shell=True, cwd=self.dir, stdout=open(os.devnull, "w"))
    with open(os.path.join(self.dir, "rst")) as handle:
        return CodeMLParser().parse(handle)
def tasks():
    query = "select distinct gi from domain_sccs d join sequenceAc a on d.parent_sequence_key=a.sequence_key where ac is not NULL and d.sccs is not NULL and d.domain_type !='psiblast'"
    with MySQLdb.connect(host="127.0.0.1", passwd="patrick_nyu", db="hpf") as cursor:
        runtime().debug(query)
        cursor.execute(query)
        runtime().debug("Fetching")
        all = cursor.fetchall()
    return [gi[0] for gi in all]
def tasks(*args):
    if len(args) == 0:
        assert False
        #tasks = [os.path.join(dir, dir + ".fas") for dir in os.listdir(os.getcwd())]
    else:
        tasks = args
    runtime().debug("TASKS", tasks)
    return tasks
def import_family(file):
    name = ".".join(os.path.basename(file).split(".")[:-1]).replace("_", " ")
    debug(name, file)
    session = Session()
    family = session.query(Family).filter(Family.name == name).first()
    if family == None:
        family = Family()
        family.name = name
        family.experiment_key = E_ID
        session.add(family)
        session.flush()
    with open(file) as handle:
        for record in SeqIO.parse(handle, "fasta"):
            seq = str(record.seq).replace("*", "")
            if not all([c in string.ascii_uppercase for c in seq]):
                runtime().println("Malformed Sequence", name, seq)
            sha1 = hashlib.sha1(seq).hexdigest()
            sequence = session.query(Sequence).filter(Sequence.sha1 == sha1).first()
            if sequence == None:
                sequence = Sequence()
                sequence.sha1 = sha1
                sequence.sequence = seq
                session.add(sequence)
                session.flush()
                debug("Added", sequence)
            sequenceAc = session.query(SequenceAc).filter(and_(SequenceAc.sequence_key == sequence.id,
                                                               SequenceAc.ac == record.id)).first()
            if sequenceAc == None:
                sequenceAc = SequenceAc()
                sequenceAc.sequence_key = sequence.id
                sequenceAc.gi = None
                sequenceAc.db = "amnh"
                sequenceAc.ac = record.id
                sequenceAc.description = "amnh"
                sequenceAc.taxonomy_id = 0
                session.add(sequenceAc)
                session.flush()
            protein = session.query(Protein).filter(and_(Protein.sequence_key == sequence.id,
                                                         Protein.experiment_key == E_ID)).first()
            if protein == None:
                protein = Protein()
                protein.experiment_key = E_ID
                protein.protein_type = "phylogeny"
                protein.sequence_key = sequence.id
                protein.probability = 0
                protein.comment = "auto added amnh families"
                protein.file_key = 0
                protein.parse_key = 0
                protein.gene_key = 0
                session.add(protein)
                session.flush()
                debug("Added", protein)
            if not family in sequence.families:
                sequence.families.append(family)
                session.flush()
    session.commit()
    session.close()
    debug("closed")
    return None
def main(*args):
    runtime().set_debug(1)
    pool = processor(synchronous=runtime().opt(SYNCHRONOUS), raise_errors=False)
    runtime().debug("Using processor", pool)
    pool.make_tasks(tasks)
    mcm_tasks = [t for t in pool.run(prep) if t != None]
    import cPickle
    with open("exported.pickle", "w") as handle:
        cPickle.dump(mcm_tasks, handle)
def main(*args):
    # Calls processor with the keyword argument 'synchronous'. runtime() creates a new
    # Runtime object or accesses the existing one, and .opt(SYNCHRONOUS) returns the
    # value of the synchronous option of the Runtime object.
    # processor returns a Map, SGEArray, or PBSArrayProcessor object.
    # NOTE: currently, PBSArrayProcessor is not fully implemented.
    pool = processor(synchronous=runtime().opt(SYNCHRONOUS))
    runtime().debug("Using processor", pool)
    pool.make_tasks(None)
    consume(pool.run(None))
def find_num_processors():
    """Utility function to find number of processors for multiprocessing"""
    from numpy.distutils import cpuinfo
    if isinstance(cpuinfo.cpu.info, list):
        num_procs = len(cpuinfo.cpu.info)
    else:
        num_procs = int(cpuinfo.cpu.info['sysctl_hw']['hw.availcpu'])
    runtime().debug("Auto-discovered %i processors" % num_procs)
    return num_procs
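# A portable alternative sketch: the standard library (Python 2.6+) reports the
# same count without relying on numpy.distutils internals:
#
#   import multiprocessing
#   num_procs = multiprocessing.cpu_count()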
def ginzu_svg(sequence_key, width=800):
    """
    Return the vector graphic for GINZU domains.
    """
    import subprocess
    cmd = "perl %s %i %i" % (os.path.join(SCRIPTS_FOLDER, "svg.pl"), sequence_key, width)
    runtime().debug(cmd)
    #print cmd
    svg = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True).communicate()[0] + "\n"
    return svg
def _do(self):
    muscle = Muscle(MuscleOptions(input=self.fasta,
                                  output=self.output,
                                  maxhours=1,
                                  **self.kwargs))
    runtime().debug("Performing Muscle w/", self.fasta)
    out = muscle.run()
    if self.phylip:
        from hpf.phylip import clustal_to_phylip
        return clustal_to_phylip(self.output)
    else:
        return self.output
def tasks(decoy_dir, psipred_dir):
    from hpf.utilities.paths import find
    runtime().debug("Searching for decoys", decoy_dir)
    decoys = list(find("[a-z]{2}[0-9]{3}\.((result)|(out))\.gz", dir=decoy_dir))
    runtime().debug("Found", len(decoys), "decoys")
    for decoy in decoys:
        prediction_code = pred_code(decoy)
        psipred = os.path.join(psipred_dir, prediction_code, prediction_code + ".psipred")
        if os.path.exists(psipred):
            yield (decoy, psipred)
def __enter__(self):
    from hpf.utilities.paths import ensure
    ensure(self.scratch)
    self.session = Session()
    runtime().debug("Loading outfile")
    self.filesystem_outfile = self.session.query(FilesystemOutfile).filter(FilesystemOutfile.prediction_code == self.prediction_code).first()
    runtime().debug("Loading sequence")
    self.sequence = self.session.query(Sequence).get(self.filesystem_outfile.sequence_key)
    debug(self.prediction_code, self.filesystem_outfile, self.sequence)
    return self
def main(*files):
    global session
    #runtime().set_debug(1)
    session = Session()
    for file in files:
        runtime().debug("Mapping file", file)
        with open(file) as handle:
            records = list(SeqIO.parse(handle, "fasta"))
        runtime().debug("Found %i records" % len(records))
        with open(file + ".hpf", "w") as handle:
            SeqIO.write(map(rename, records), handle, "fasta")
def tunnel(sleep=None):
    """
    Ensure a tunnel to the uwashington mysql server is running
    """
    from scripts import TUNNEL
    if sleep != None:
        import time, random
        time.sleep(random.random() * sleep)
    runtime().debug(TUNNEL)
    import subprocess
    # devnull must be opened for writing to serve as stdout/stderr
    with open(os.devnull, "w") as handle:
        subprocess.call(TUNNEL, shell=True, stdout=handle, stderr=handle)
def blast(fasta, db="hpf_protein", force=False):
    """
    Map families to the database.
    """
    runtime().debug(fasta)
    file = os.path.abspath(fasta)
    base = ".".join(os.path.basename(file).split(".")[:-1])
    name = base.replace("_", " ")
    dir = base
    subprocess.call("mkdir -p %s" % base, shell=True)
    runtime().debug(dir, fasta, base)
    cwd = os.getcwd()
    os.chdir(dir)
    try:
        raise Exception("This has been modified like crazy, don't run as is, make sure this is correct")
        #runtime().pushd(dir)
        #formatted_fasta = FormatFastaTask(file, base + ".fasta").run(force=force)
        # Cluster the family representatives before blasting everything to HPF
        #cdhit_fasta = CdhitTask(formatted_fasta, formatted_fasta + ".cdhit", identity=0.7, length=0.7).run()
        #alignment = MuscleTask(formatted_fasta, base + ".aln", clwstrict=True).run(force=force)
        #alignment = FormatAlignmentTask(alignment, base + ".alnf").run(force=force)
        #blast_xml, blast_chk = InputAlignmentBlastTask(formatted_fasta, alignment, db="hpf_protein").run(force=force)
        #blast_matches = BlastParseXMLTask(blast_xml, base + ".hpf.fasta", 0.8, expect=1e-6).run(force=force)
        #blast_matches = BlastTask(cdhit_fasta, alignment, base + ".hpf.fasta").run(force=force)
        #graphics = DomainGraphicsTask(base, blast_matches, base + ".svg", "", "").run(force=force)
        blast_matches = base + ".hpf.fasta"
        session = Session()
        family = session.query(Family).filter(Family.name == name).one()
        debug(family)
        with open(blast_matches) as handle:
            for record in SeqIO.parse(handle, "fasta"):
                map = session.query(FamilySequence).filter(and_(FamilySequence.family_key == family.id,
                                                                FamilySequence.sequence_key == int(record.id))).first()
                if map == None:
                    map = FamilySequence()
                    map.family_key = family.id
                    map.sequence_key = int(record.id)
                    debug(map)
                    session.add(map)
        session.commit()
        session.close()
        #runtime().popd()
    finally:
        os.chdir(cwd)
def merge(self):
    from hpf.hddb.db import Session, Family
    self.session = Session()
    self.family = self.session.query(Family).filter(Family.name == self.familyName).first()
    if not self.family:
        runtime().debug("Creating family", self.familyName)
        self._family()
        self._alignment()
        self._tree()
    else:
        self.alignment = self.family.alignment
        self.tree = self.alignment.tree
        runtime().debug("Found family", self.family.id)
    if not self.family.alignments[0].tree.codeml:
        runtime().debug("Importing codeml")
        self._codeml()
    else:
        runtime().debug("Already found codeml", self.family.alignments[0].tree.codeml.id)
    # Commit the session, close, and finish
    self.session.commit()
    self.session.close()
def sequences(cursor, sequence_key=None):
    """
    Return the given sequences as SeqRecord objects.
    """
    query = "SELECT id,sequence from sequence"
    if sequence_key != None:
        if not hasattr(sequence_key, "__iter__"):
            sequence_key = [sequence_key]
        query += " where id in (%s)" % (",".join([str(key) for key in sequence_key]))
    runtime().debug(query)
    cursor.execute(query)
    for id, sequence in cursor.fetchall():
        yield SeqRecord(Seq(sequence), str(id))
def _tree(self):
    session = self.session
    # Load the tree file and rename the taxa.
    # from Bio.Nexus.Nexus import Nexus
    # nex = Nexus(self.treeFile)
    # self.nexus = nex.trees[0]
    from Bio.Nexus.Trees import Tree as NewickTree
    tree_str = open(self.treeFile).read()
    self.nexus = NewickTree(tree_str)
    # Rename all the taxa.
    for id in self.nexus.get_terminals():
        node = self.nexus.node(id)
        node.data.taxon = self._index(node.data.taxon)
    # Create the DB object
    from hpf.hddb.db import Tree
    self.tree = Tree(alignment_key=self.alignment.id,
                     text=self.nexus.to_string(plain=False, plain_newick=True),
                     filename=self.treeFile)
    session.add(self.tree)
    session.flush()
    # Now add in the node references
    self.nexus.name = self.tree.id
    assert self.tree.id != None
    runtime().debug("Added tree", self.tree)
    from hpf.hddb.db import TreeNodeFactory
    nodes = list(TreeNodeFactory().create(self.nexus))
    for node in nodes:
        node.ancestor_node = node.ancestor.id if node.ancestor else None
        # This should add the new object into the session
        self.tree.nodes.append(node)
        #session.add(node)
        session.flush()
    runtime().debug("Appended", len(nodes), "tree nodes")
    session.flush()
    # Now import the diagnostic characters and reference the nodes.
    from hpf.amnh.oid import DiagCharsParser
    from hpf.hddb.db import TreeFactory
    biotree = TreeFactory(name_func=lambda node: str(node.id)).create(self.tree.nodes, self.tree.id)
    parser = DiagCharsParser(biotree)
    runtime().debug(self.treeDiagCharsFile)
    with open(self.treeDiagCharsFile) as handle:
        diagchars = list(parser.parse(handle))
    runtime().debug("DiagChars", len(diagchars))
    for d in diagchars:
        session.add(d)
    session.flush()
def merge(selection):
    selection.family_key = family
    selection.tree_key = tree
    # try:
    #     s.sequence_key = int(record.id)
    # except:
    #     seq = session.query(Sequence).filter(Sequence.sha1 == sha1(str(record.seq).replace("-", "")).hexdigest()).first()
    #     if seq:
    #         s.sequence_key = seq.id
    #     else:
    #         runtime().debug("Can't find sequence_key for", record.id, str(record.seq).replace("-", ""))
    #         return
    selection = session.merge(selection)
    runtime().debug("Merged", selection)
    return selection
def create(self, seed, targets, format="fasta", **kwargs):
    """
    Performs a default Mafft seed alignment using the seed and target
    @return: Seed alignment object
    """
    output = NamedTemporaryFile(**kwargs)
    with TemporaryAlignmentFile([seed], format=format, **kwargs) as seed_file:
        with TemporaryRecordFile(targets, format=format, **kwargs) as target_file:
            cmd = "mafft-linsi --quiet --seed %s %s > %s" % (seed_file, target_file, output.name)
            runtime().debug(cmd)
            subprocess.check_call(cmd, shell=True)
            #print open(output.name).read()
            with open(output.name) as handle:
                alignment = AlignmentFactory(self._class).read(handle, format)
            return alignment
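# A usage sketch for the seed-alignment factory, assuming Biopython SeqRecord
# inputs; the `factory` instance name and the records below are hypothetical:
#
#   seed = SeqRecord(Seq("MKTAYIAKQR"), id="seed")
#   targets = [SeqRecord(Seq("MKTAYIAKQR"), id="t1"), SeqRecord(Seq("MKTAYIVKQR"), id="t2")]
#   alignment = factory.create(seed, targets, format="fasta")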
def main(fam, t, *files):
    global family, tree, session
    session = Session()
    family = int(fam)
    tree = int(t)
    for file in files:
        #runtime().set_debug(1)
        runtime().debug("Using file", file)
        with open(file) as handle:
            ps = PositiveSelectionParser().parse(handle)
            count = consume(imap(merge, ps))
            runtime().debug("Found", count, "sites")
    session.commit()
    session.close()
def xml(id):
    from hpf.hddb.db import Session, Family
    session = Session()
    family = session.query(Family).get(id)
    filename = "%i.xml" % family.id
    if runtime().opt(GZIP):
        import gzip
        filename = "%s.gz" % filename
        handle = gzip.open(filename, "w")
    else:
        handle = open(filename, "w")
    try:
        doc = FamilyFeatureBuilder(
            lambda: DefaultXMLGenerator(handle, pretty=True),
            lambda handler: StructureFeatureProvider(handler),
            lambda handler: ColumnFeatureProvider(handler),
            lambda handler: IeaFeatureProvider(handler),
            lambda handler: SelectionFeatureProvider(handler))
        doc.buildDocument(family)
    finally:
        handle.close()
    session.close()
def add(record):
    record._digest = digest(record)
    z = session.query(Sequence).filter(Sequence.sha1 == record._digest).first()
    if not z:
        z = Sequence(sequence=str(record.seq), sha1=record._digest)
        session.add(z)
        session.flush()
        runtime().debug("Added sequence", record.id, z.id)
    else:
        runtime().debug("Found sequence", record.id, z.id)
    record._hddb = z
    record = ac(record)
    record = protein(record)
    record.id = str(record._ac.protein_key)
    return record
def _do(argv): r = runtime() r.description(""" template.py [-options] args template script. """) r.add_option( Flag(SYNCHRONOUS, "s", description= "Run this script synchronously without any multi-processing")) r.add_option( Flag(NEXUS, "n", description="The input files are in NEXUS format.")) r.add_option( IntegerOption( MAX_SIZE, "l", description="Maximum number of leaves in the trees. Default:40", default=40)) r.add_option( FileOption( DIRECTORY, "o", description="Directory to export to. Default to dirname(tree_file)", default=None)) args = r.parse_options(argv) main(*args)
def split(tree_file, size, nexus=False, dir=None):
    print tree_file, size
    if nexus:
        tree = Nexus(tree_file).trees[0]
        tree2 = Nexus(tree_file).trees[0]
    else:
        with open(tree_file) as handle:
            tree_str = handle.read()
        tree = Tree(tree_str)
        tree2 = Tree(tree_str)
    # with open(align_file) as handle:
    #     alignment = AlignIO.read(handle, "phylip")
    splitter = TreeSplitter(tree, max_size=size, annotater=UnrootedShortestPath)
    subs = list(splitter.subtrees())
    runtime().debug("Found", len(subs), subs)
    dir = dir if dir else os.path.dirname(tree_file)
    for i, tree in enumerate(subs):
        nodes = [tree.node(node) for node in tree.all_ids()]
        taxa = set([node.data.taxon for node in nodes if node.data.taxon != None])
        # Prefix this subtree's taxa with the subtree index in the annotated copy
        for terminal in tree2.get_terminals():
            node = tree2.node(terminal)
            if node.data.taxon in taxa:
                node.data.taxon = "%i-" % i + node.data.taxon
        # sub_taxa = tree.get_taxa()
        # sub_alignment = Alignment(alphabet=alignment._alphabet)
        # sub_alignment._records = [r for r in alignment._records if r.id in sub_taxa]
        # assert len(sub_taxa) == len(sub_alignment._records)
        # align_out = "%s.%i" % (os.path.join(dir, os.path.basename(align_file)), i)
        # with open(align_out, "w") as handle:
        #     AlignIO.write([sub_alignment], handle, "phylip")
        # from hpf.phylip import interleave
        # interleave(align_out)
        with open("%s.%i" % (os.path.join(dir, os.path.basename(tree_file)), i), "w") as handle:
            print >>handle, tree.to_string(plain_newick=True, branchlengths_only=False) + ";"
    with open("%s.annotated" % os.path.join(dir, os.path.basename(tree_file)), "w") as handle:
        print >>handle, tree2.to_string(plain_newick=True, branchlengths_only=False) + ";"
def _do(argv): r = runtime() r.description(""" map_fasta.py [-options] fasta Rename all of the records with sequence id's from the database. """) args = r.parse_options(argv) main(*args)
def _do(argv): r = runtime() r.description(""" import_proteome.py [-options] fasta experiment_id template script. """) args = r.parse_options(argv) main(*args)
def _do(argv): r = runtime() r.description(""" oid.py [-options] *oid-directory Import an OID family. """) #r.add_option(Flag(SYNCHRONOUS, "s", description="Run this script synchronously without any multi-processing")) args = r.parse_options(argv) main(*args)
def merge(record):
    s = FamilySequence()
    s.family_key = family
    s.sequence_key = int(record.id)
    # try:
    #     s.sequence_key = int(record.id)
    # except:
    #     seq = session.query(Sequence).filter(Sequence.sha1 == sha1(str(record.seq).replace("-", "")).hexdigest()).first()
    #     if seq:
    #         s.sequence_key = seq.id
    #     else:
    #         runtime().debug("Can't find sequence_key for", record.id, str(record.seq).replace("-", ""))
    #         return
    s.alignment = str(record.seq)
    s.seed = True
    s = session.merge(s)
    runtime().debug("Merged", s.sequence_key, s.family_key, s.alignment)
    return s
def _do(argv): r = runtime() r.description(""" import_codeml.py [-options] familykey treekey *codeml_output Import codeml positive selection output. """) #r.add_option(Option(FORMAT, "f", description="Alignment Format", default="fasta")) args = r.parse_options(argv) main(*args)
def blast(self, fasta, output):
    """
    Blast the fasta, consume the output buffer, return the output filename
    """
    runtime().debug("Blasting %s with alignment %s using %s" % (fasta, self.alignment, self.blast_exe))
    r, e = NCBIStandalone.blastpgp(self.blast_exe,
                                   self.db,
                                   fasta,
                                   align_infile=self.alignment,
                                   align_outfile=output,
                                   expectation=self.expect,
                                   model_threshold=self.expect,
                                   npasses=3,
                                   nprocessors=1,
                                   **self.kwargs)
    consume(r)
    return output
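# A usage sketch, assuming an instance configured with blast_exe, db, alignment,
# and expect; the `blaster` name and file paths are hypothetical. (Biopython's
# NCBIStandalone module is deprecated in favor of the command-line wrappers,
# but this code still targets it.)
#
#   out = blaster.blast("family.fasta", "family.blast")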
def main(file, experiment_id):
    global fasta, session, experiment
    #runtime().set_debug(1)
    runtime().debug("Using file", file)
    session = Session()
    fasta = os.path.basename(file)
    experiment = session.query(Experiment).get(experiment_id)
    with open(file) as handle:
        records = list(SeqIO.parse(handle, "fasta"))
    runtime().debug("Found %i records" % len(records))
    runtime().indent()
    records = map(add, records)
    session.close()
    runtime().unindent()
    runtime().debug("Writing %i records" % len(records))
    with open(file + ".hpf", "w") as handle:
        SeqIO.write(records, handle, "fasta")
def _do(argv): r = runtime() r.description(""" import_alignment.py [-options] fasta family Import an alignment and attach to a family. """) r.add_option( Option(FORMAT, "f", description="Alignment Format", default="fasta")) args = r.parse_options(argv) main(*args)
def ac(record):
    acc = session.query(SequenceAc).filter(and_(SequenceAc.sequence_key == record._hddb.id,
                                                SequenceAc.db == 'amnh',
                                                SequenceAc.ac == record.id)).first()
    if not acc:
        acc = SequenceAc(sequence_key=record._hddb.id,
                         gi=None,
                         db='amnh',
                         ac=record.id,
                         ac2=None,
                         description=fasta,
                         taxonomy_id=experiment.taxonomy_id)
        session.add(acc)
        session.flush()
        runtime().debug("Added ac", acc)
    else:
        runtime().debug("Found ac", acc)
    record._ac = acc
    return record
def protein(record):
    if record._ac.protein_key == None:
        protein = Protein(experiment_key=experiment.id,
                          protein_type='amnh',
                          sequence_key=record._hddb.id,
                          probability=0,
                          comment="AMNH protein families",
                          file_key=0,
                          parse_key=0,
                          gene_key=0,
                          insert_data=func.CURRENT_DATE())
        session.add(protein)
        session.flush()
        record._ac.protein_key = protein.id
        # record._ac = session.merge(record._ac)
        # record._ac.protein_key = protein.id
        runtime().debug("Modified", session.is_modified(record._ac))
        session.add(record._ac)
        record._protein = protein
        runtime().debug("Added protein", record._protein)
    else:
        record._protein = session.query(Protein).get(record._ac.protein_key)
        runtime().debug("Found protein key", record._protein)
    return record
def _do(argv): r = runtime() r.description(""" import_tree.py [-options] tree_file family_key Import a tree into the database. """) r.add_option( Flag(SYNCHRONOUS, "s", description= "Run this script synchronously without any multi-processing")) args = r.parse_options(argv) main(*args)
def _do(argv): r = runtime() r.description(""" template.py [-options] args template script. """) r.add_option( Flag(SYNCHRONOUS, "s", description= "Run this script synchronously without any multi-processing")) args = r.parse_options(argv) main(*args)
def _do(argv): r = runtime() r.description(""" export_xml.py [-options] args Exports full structure/family reports for families to xml files. """) r.add_option( Flag(SYNCHRONOUS, "s", description= "Run this script synchronously without any multi-processing")) r.add_option(Flag(GZIP, "z", description="GZip the resulting files.")) args = r.parse_options(argv) main(*args)
def run(self, force=False):
    output = self.output
    if not hasattr(output, '__iter__'):
        output = [output]
    which = [os.path.exists(f) for f in output]
    if not all(which) or force == True:
        try:
            r = self._do()
            return r if r != None else self.output
        except:
            #print >>sys.stderr, "failure. deleting ", self.output
            delete = [self.output] if isinstance(self.output, str) else self.output
            for file in delete:
                try:
                    if os.path.exists(file):
                        os.remove(file)
                except:
                    print >>sys.stderr, "cannot remove file ", file
            raise
    else:
        runtime().debug("Output exists, skipping", [f for i, f in enumerate(output) if which[i] == True])
        return self._exists()
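# The contract sketched by run(): subclasses implement _do() to produce
# self.output (and _exists() for the skip path); run() re-executes only when
# some declared output file is missing or force=True, and deletes partial
# output on failure. A hypothetical minimal subclass:
#
#   class TouchTask(Task):  # the 'Task' base-class name is an assumption
#       def __init__(self, output):
#           self.output = output
#       def _do(self):
#           open(self.output, "w").close()
#       def _exists(self):
#           return self.output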
def _do(argv): r = runtime() r.description(""" families.py [-options] args Blast protein famlies against HPF database and gather representatives. """) r.add_option( Flag(SYNCHRONOUS, "s", description= "Run this script synchronously without any multi-processing")) r.add_option(Flag(FORCE, "f", description="Force re-running of all tasks.")) args = r.parse_options(argv) main(*args)
def main(*args):
    pool = processor(synchronous=runtime().opt(SYNCHRONOUS))
    runtime().debug("Using processor", pool)
    pool.make_tasks(lambda: zip(args,
                                # [args[i] for i in range(0, len(args), 2)],
                                # [args[i] for i in range(1, len(args), 2)],
                                repeat(runtime().opt(MAX_SIZE), len(args)),
                                repeat(runtime().opt(NEXUS), len(args)),
                                repeat(runtime().opt(DIRECTORY), len(args))))
    consume(pool.run(_split))