def piechart(self, domain_type):
    """
    Draw a pie chart of the mean coverage per domain type and save it.

    domain_type -- mapping indexed by every entry of DOMAIN_TYPES; each value
                   is a sized collection of coverage numbers (may be empty).

    The figure is titled with self.family_name and written to self.output[0].
    """
    pie_file = self.output[0]
    pylab.figure(figsize=(5, 5))
    probs, labels = [], []
    for dtype in DOMAIN_TYPES:
        values = domain_type[dtype]
        # An empty collection is reported as 0.0 rather than being averaged.
        mean_coverage = mean(values) if len(values) > 0 else 0.0
        debug(self.family_name, dtype, mean_coverage)
        probs.append(mean_coverage)
        labels.append(dtype)
    # Slices are drawn without labels; the figure legend names them instead.
    patches, texts = pylab.pie(probs,
                               explode=None,
                               labels=None,
                               shadow=False,
                               colors=DOMAIN_COLORS)
    pylab.figlegend(patches,
                    labels,
                    "lower left",
                    fancybox=True,
                    markerscale=0.2)
    pylab.title(self.family_name)
    pylab.savefig(pie_file)
def do_alignment(cl, parse=True):
    """
    Run the mammoth command described by `cl` and optionally parse its output.

    cl    -- command-line object; str(cl) is the shell command, and its
             `tempd`, `cwd` and `output` attributes are read here.
    parse -- when False, return the (stdout, stderr) tuple from
             communicate() without parsing anything.

    Returns a MammothAlignment whose `_cwd` attribute records the working
    directory the command ran in.
    Raises subprocess.CalledProcessError when the command exits non-zero.
    """
    command = str(cl)
    # NOTE(review): the test is on `tempd` but the fallback reads `cwd` --
    # confirm these are meant to be different attributes.
    if cl.tempd is None:
        cwd = tempfile.mkdtemp()
    else:
        cwd = cl.cwd
    debug("Running mammoth", command)
    # The devnull handle is closed once the process has been dealt with.
    with open(os.devnull, "w") as devnull:
        p = subprocess.Popen(command,
                             shell=True,
                             stdout=subprocess.PIPE,
                             stderr=devnull,
                             cwd=cwd)
        if not parse:
            return p.communicate()
        # Now parse output
        if cl.output:
            # Consume the process' pipe so we don't block
            for _ in p.stdout:
                pass
            with open(cl.output) as parse_output:
                parsed = tuple(_parse_mammoth(parse_output))
        else:
            parsed = tuple(_parse_mammoth(p.stdout))
        # Wait for process to finish
        rcode = p.wait()
    if rcode != 0:
        # CalledProcessError takes (returncode, cmd), in that order.
        raise subprocess.CalledProcessError(rcode, command)
    (ini_psi, ini_rms, end_psi, end_rms, zscore, evalue,
     pred_seq, pred_ss, exp_seq, exp_ss) = parsed
    mm = MammothAlignment(ini_rms=ini_rms,
                          ini_psi=ini_psi,
                          end_rms=end_rms,
                          end_psi=end_psi,
                          zscore=zscore,
                          evalue=evalue,
                          pred_seq=pred_seq,
                          pred_ss=pred_ss,
                          exp_seq=exp_seq,
                          exp_ss=exp_ss)
    mm._cwd = cwd
    return mm
def thread_setup():
    """
    Initialise the module-level Session if it has not been set yet.

    When the global Session is None, clear() is called, setup() is run and
    the third element of its result becomes the new global Session.
    Otherwise nothing happens.
    """
    global Session
    if Session is not None:
        return
    clear()
    # Only the third value returned by setup() is used here.
    _e, _b, s = setup()
    # Logged before the assignment, so the old (None) Session is shown too.
    debug("Entering subprocess setup", s, Session)
    Session = s
def parse(self, output):
    """
    Parse the Blast handle and append records to the output file

    output -- path of a Blast XML result file readable by NCBIXML.parse.

    A hit is kept when the ratio between the shorter and the longer of
    (hsp.align_length, query_letters) is at least self.align_perc and its
    expect value does not exceed self.expect.  Identifiers already present
    in self.output are skipped; the rest are fetched through hddb.proteins
    and appended to self.output in fasta format.
    """
    matches = set()
    with open(output) as handle:
        runtime().debug("Parsing blast results")
        for blast_record in NCBIXML.parse(handle):
            for alignment in blast_record.alignments:
                for hsp in alignment.hsps:
                    lengths = (hsp.align_length, blast_record.query_letters)
                    shorter = float(min(lengths))
                    longer = float(max(lengths))
                    if shorter / longer < self.align_perc or hsp.expect > self.expect:
                        continue
                    # First token of the title, minus its 4-character prefix
                    first_token = alignment.title.split()[0]
                    matches.add(first_token[4:])
        runtime().debug("Found %i hits" % len(matches))

    if os.path.exists(self.output):
        with open(self.output) as fasta_file:
            already_found = set(r.id for r in SeqIO.parse(fasta_file, "fasta"))
        debug("Found %i already" % len(matches.intersection(already_found)))
        matches = matches - already_found
    debug("Found %i new matches" % len(matches))

    if len(matches) > 0:
        with hddb.connect("ddbCommon") as cursor:
            records = list(hddb.proteins(cursor, sequence_key=matches))
        runtime().debug("Formatting blast results into fasta file")
        with open(self.output, "a") as fasta_file:
            SeqIO.write(records, fasta_file, "fasta")
def thread_setup():
    """
    Make sure the module-level Session exists before work starts.

    If the global Session is None: call clear(), run setup(), and store the
    third element of setup()'s result as the global Session.  A Session that
    is already set is left untouched.
    """
    global Session
    if Session is None:
        clear()
        # setup() returns three values; only the last one is kept.
        _e, _b, s = setup()
        # Session still holds its previous value when this is logged.
        debug("Entering subprocess setup", s, Session)
        Session = s
def makeDirs(path):
    """
    Create `path` with os.makedirs, tolerating a path that already exists.

    Raises a PathException carrying the OS error number when os.makedirs
    fails for any reason other than EEXIST.
    """
    from hpf.utilities import debug
    debug("Ensuring: %s" % path)
    try:
        os.makedirs(path)
    except OSError as err:
        already_there = err.errno == errno.EEXIST
        if already_there:
            return
        raise PathException("Cannot make dir", err.errno)
def __enter__(self):
    """
    Prepare the scratch directory and load the database rows for this code.

    Opens a Session, stores the first FilesystemOutfile whose
    prediction_code equals self.prediction_code and the Sequence looked up
    by that outfile's sequence_key, then returns self.
    """
    from hpf.utilities.paths import ensure
    ensure(self.scratch)
    self.session = Session()

    runtime().debug("Loading outfile")
    outfile_query = self.session.query(FilesystemOutfile)
    outfile_query = outfile_query.filter(
        FilesystemOutfile.prediction_code == self.prediction_code)
    self.filesystem_outfile = outfile_query.first()

    runtime().debug("Loading sequence")
    sequence_query = self.session.query(Sequence)
    self.sequence = sequence_query.get(self.filesystem_outfile.sequence_key)

    debug(self.prediction_code, self.filesystem_outfile, self.sequence)
    return self
def _do(self):
    """
    Blast every record from self.records() and parse each result.

    Each record is blasted into a NamedTemporaryFile created in the current
    working directory, and that file is handed to self.parse.  A failure on
    one record is logged through debug() and the loop moves on to the next.
    """
    for i, fasta in enumerate(self.records()):
        try:
            with NamedTemporaryFile(prefix="blast", dir=os.getcwd()) as blast_out:
                self.blast(fasta, blast_out.name)
                self.parse(blast_out.name)
        except Exception as e:
            # Best effort: one bad record must not abort the remaining ones.
            debug("Failed to blast/parse %i record in %s" % (i, self.fasta))
            debug("\t", e)
def _decoy(self):
    """
    Copy the decoy file at self.decoy to self.destination.

    Asserts that self.decoy exists, copies it with shutil.copy and returns
    the destination path.
    """
    assert os.path.exists(self.decoy), self.decoy
    debug("Found", self.decoy)
    target = self.destination
    shutil.copy(self.decoy, target)
    debug("Copied to", target)
    return target
def find(pattern, dir=None):
    """Searches a directory path and yields regular expression matches.

    pattern -- regular expression applied with re.search to the absolute
               path of every file found under `dir`.
    dir     -- root directory to walk.  When omitted, the current working
               directory is looked up when iteration starts.

    Yields the absolute path of each file whose path matches.
    """
    import re
    if dir is None:
        # A default of os.getcwd() in the signature would be evaluated only
        # once, at definition time, and ignore any later os.chdir().
        dir = os.getcwd()
    regex = re.compile(pattern)
    for path, _dirnames, fnames in os.walk(dir):
        for fn in fnames:
            debug(fn)
            abs_path = os.path.abspath(os.path.join(path, fn))
            if regex.search(abs_path):
                yield abs_path
def blast(fasta, db="hpf_protein", force=False):
    """
    Map families to the database.

    Currently disabled: after creating and entering a directory named after
    the fasta file's base name, the function raises unconditionally, so the
    mapping code below the raise is never reached.  The previous working
    directory is always restored.
    """
    runtime().debug(fasta)
    fasta_path = os.path.abspath(fasta)
    # Everything before the last dot of the file name ("" when there is none)
    base = os.path.basename(fasta_path).rpartition(".")[0]
    name = base.replace("_", " ")
    work_dir = base
    subprocess.call("mkdir -p %s" % base, shell=True)
    runtime().debug(work_dir, fasta, base)
    cwd = os.getcwd()
    os.chdir(work_dir)
    try:
        raise Exception(
            "This has been modified like crazy, don't run as is, make sure this is correct"
        )
        #runtime().pushd(dir)
        #formatted_fasta = FormatFastaTask(file,base+".fasta").run(force=force)
        # Cluster the families representatives before blasting everything to HPF
        #cdhit_fasta = CdhitTask(formatted_fasta,formatted_fasta+".cdhit",identity=0.7,length=0.7).run()
        #alignment = MuscleTask(formatted_fasta, base+".aln", clwstrict=True).run(force=force)
        #alignment = FormatAlignmentTask(alignment,base+".alnf").run(force=force)
        #blast_xml, blast_chk = InputAlignmentBlastTask(formatted_fasta,alignment, db="hpf_protein").run(force=force)
        #blast_matches = BlastParseXMLTask(blast_xml, base+".hpf.fasta", 0.8, expect=1e-6).run(force=force)
        #blast_matches = BlastTask(cdhit_fasta,alignment,base+".hpf.fasta").run(force=force)
        #graphics = DomainGraphicsTask(base,blast_matches,base+".svg","","").run(force=force)
        blast_matches = base + ".hpf.fasta"
        session = Session()
        family = session.query(Family).filter(Family.name == name).one()
        debug(family)
        with open(blast_matches) as handle:
            for record in SeqIO.parse(handle, "fasta"):
                sequence_key = int(record.id)
                link = session.query(FamilySequence).filter(
                    and_(FamilySequence.family_key == family.id,
                         FamilySequence.sequence_key == sequence_key)).first()
                if link is None:
                    link = FamilySequence()
                    link.family_key = family.id
                    link.sequence_key = int(record.id)
                    debug(link)
                    session.add(link)
        session.commit()
        session.close()
        #runtime().popd()
    finally:
        os.chdir(cwd)
def _do(self):
    """
    Run blast and parse for each record yielded by self.records().

    Blast output for a record goes to a NamedTemporaryFile placed in the
    current working directory, which is then passed to self.parse.  Any
    exception raised for a record is reported via debug() and skipped.
    """
    for i, fasta in enumerate(self.records()):
        try:
            with NamedTemporaryFile(prefix="blast",
                                    dir=os.getcwd()) as blast_out:
                self.blast(fasta, blast_out.name)
                self.parse(blast_out.name)
        except Exception as e:
            # Deliberately broad: keep going with the remaining records.
            debug("Failed to blast/parse %i record in %s" % (i, self.fasta))
            debug("\t", e)
def _db(self):
    """
    Write the psipred prediction held on self.sequence to self.psipred.

    Returns the path that was written.
    """
    debug("Getting psipred from database")
    from hpf.pdb.psipred import PsipredWriter
    target = self.psipred
    with open(target, "w") as handle:
        writer = PsipredWriter()
        writer.write(handle,
                     self.sequence.psipred.psipred,
                     self.sequence.record)
    return target
def blast(fasta, db="hpf_protein", force=False):
    """
    Map families to the database.

    The function is switched off on purpose: it creates a directory named
    after the fasta file (extension removed), changes into it, and then
    raises.  The statements after the raise are kept for reference only.
    The original working directory is restored in every case.
    """
    runtime().debug(fasta)
    absolute = os.path.abspath(fasta)
    stem_parts = os.path.basename(absolute).split(".")[:-1]
    base = ".".join(stem_parts)
    name = base.replace("_", " ")
    target_dir = base
    subprocess.call("mkdir -p %s" % base, shell=True)
    runtime().debug(target_dir, fasta, base)
    previous_dir = os.getcwd()
    os.chdir(target_dir)
    try:
        raise Exception("This has been modified like crazy, don't run as is, make sure this is correct")
        #runtime().pushd(dir)
        #formatted_fasta = FormatFastaTask(file,base+".fasta").run(force=force)
        # Cluster the families representatives before blasting everything to HPF
        #cdhit_fasta = CdhitTask(formatted_fasta,formatted_fasta+".cdhit",identity=0.7,length=0.7).run()
        #alignment = MuscleTask(formatted_fasta, base+".aln", clwstrict=True).run(force=force)
        #alignment = FormatAlignmentTask(alignment,base+".alnf").run(force=force)
        #blast_xml, blast_chk = InputAlignmentBlastTask(formatted_fasta,alignment, db="hpf_protein").run(force=force)
        #blast_matches = BlastParseXMLTask(blast_xml, base+".hpf.fasta", 0.8, expect=1e-6).run(force=force)
        #blast_matches = BlastTask(cdhit_fasta,alignment,base+".hpf.fasta").run(force=force)
        #graphics = DomainGraphicsTask(base,blast_matches,base+".svg","","").run(force=force)
        blast_matches = base + ".hpf.fasta"
        session = Session()
        family = session.query(Family).filter(Family.name == name).one()
        debug(family)
        with open(blast_matches) as handle:
            for record in SeqIO.parse(handle, "fasta"):
                same_pair = and_(FamilySequence.family_key == family.id,
                                 FamilySequence.sequence_key == int(record.id))
                mapping = session.query(FamilySequence).filter(same_pair).first()
                if mapping is not None:
                    continue
                mapping = FamilySequence()
                mapping.family_key = family.id
                mapping.sequence_key = int(record.id)
                debug(mapping)
                session.add(mapping)
        session.commit()
        session.close()
        #runtime().popd()
    finally:
        os.chdir(previous_dir)
def import_family(file):
    """
    Load one family fasta file into the database.

    file -- path to a fasta file.  The family name is the file's base name
            up to its last dot, with underscores replaced by spaces.

    For every record, the Sequence (looked up by SHA-1 of the residues with
    '*' removed), its SequenceAc, and a Protein for experiment E_ID are
    created when missing, and the sequence is attached to the family.
    Everything is committed once at the end; the session is closed even if
    an error interrupts the import.  Returns None.
    """
    name = ".".join(os.path.basename(file).split(".")[:-1]).replace("_", " ")
    debug(name, file)
    session = Session()
    try:
        family = session.query(Family).filter(Family.name == name).first()
        if family is None:
            family = Family()
            family.name = name
            family.experiment_key = E_ID
            session.add(family)
            session.flush()
        with open(file) as handle:
            for record in SeqIO.parse(handle, "fasta"):
                seq = str(record.seq).replace("*", "")
                # Only reported; a malformed sequence is still imported.
                if not all(c in string.ascii_uppercase for c in seq):
                    runtime().println("Malformed Sequence", Family, seq)
                sha1 = hashlib.sha1(seq).hexdigest()

                sequence = session.query(Sequence).filter(
                    Sequence.sha1 == sha1).first()
                if sequence is None:
                    sequence = Sequence()
                    sequence.sha1 = sha1
                    sequence.sequence = seq
                    session.add(sequence)
                    # Flush so sequence.id is available below.
                    session.flush()
                    debug("Added", sequence)

                sequenceAc = session.query(SequenceAc).filter(
                    and_(SequenceAc.sequence_key == sequence.id,
                         SequenceAc.ac == record.id)).first()
                if sequenceAc is None:
                    sequenceAc = SequenceAc()
                    sequenceAc.sequence_key = sequence.id
                    sequenceAc.gi = None
                    sequenceAc.db = "amnh"
                    sequenceAc.ac = record.id
                    sequenceAc.description = "amnh"
                    sequenceAc.taxonomy_id = 0
                    session.add(sequenceAc)
                    session.flush()

                protein = session.query(Protein).filter(
                    and_(Protein.sequence_key == sequence.id,
                         Protein.experiment_key == E_ID)).first()
                if protein is None:
                    protein = Protein()
                    protein.experiment_key = E_ID
                    protein.protein_type = "phylogeny"
                    protein.sequence_key = sequence.id
                    protein.probability = 0
                    protein.comment = "auto added amnh families"
                    protein.file_key = 0
                    protein.parse_key = 0
                    protein.gene_key = 0
                    session.add(protein)
                    session.flush()
                    debug("Added", protein)

                if family not in sequence.families:
                    sequence.families.append(family)
                    session.flush()
        session.commit()
    finally:
        # Release the session even when the import fails part-way.
        session.close()
    debug("closed")
    return None
def piechart(self, domain_type):
    """
    Save a pie chart showing the mean coverage of each domain type.

    domain_type -- mapping looked up with every entry of DOMAIN_TYPES; each
                   value is a sized collection of coverage numbers and may
                   be empty.

    The chart is titled with self.family_name, coloured with DOMAIN_COLORS
    and saved to self.output[0].
    """
    pie_file = self.output[0]
    pylab.figure(figsize=(5, 5))
    probs = []
    labels = []
    for dtype in DOMAIN_TYPES:
        coverage = domain_type[dtype]
        # No samples for this type: plot it as 0.0.
        mean_coverage = mean(coverage) if len(coverage) > 0 else 0.0
        debug(self.family_name, dtype, mean_coverage)
        probs.append(mean_coverage)
        labels.append(dtype)
    # The pie itself carries no labels; they are shown in the legend.
    patches, texts = pylab.pie(probs, explode=None, labels=None,
                               shadow=False, colors=DOMAIN_COLORS)
    pylab.figlegend(patches, labels, "lower left",
                    fancybox=True, markerscale=0.2)
    pylab.title(self.family_name)
    pylab.savefig(pie_file)
def annotate_records(fasta):
    """Attach length of domain_type coverage over each protein in fasta file

    fasta -- path to a fasta file whose record ids are integer sequence keys.

    Every record gets annotations['domain_type'], a defaultdict (default 0)
    filled from the summed sequence lengths returned by the domain query.
    Returns the values of the id -> record mapping.  A file without records
    returns them without touching the database.
    """
    with open(fasta) as handle:
        records = list(SeqIO.parse(handle, "fasta"))
    protein = {}
    for r in records:
        protein[int(r.id)] = r
        r.annotations['domain_type'] = defaultdict(int)
    # With no keys the query would contain "in ()", which is not valid SQL.
    if protein:
        with hddb.connect("ddbCommon") as cursor:
            # Keys are ints produced by int(r.id), so joining them is safe.
            key_list = ",".join(str(key) for key in protein)
            # NOTE(review): rows are grouped by domain_type only, not by
            # parent_sequence_key -- confirm this is intended.
            query = """select d.parent_sequence_key, d.domain_type, sum(length(sequence)) from domain d join sequence s on d.domain_sequence_key=s.id where parent_sequence_key in (%s) group by domain_type""" % key_list
            debug(query)
            cursor.execute(query)
            for protein_key, domain_type, dlen in cursor.fetchall():
                debug(len(protein[protein_key].seq), domain_type, dlen)
                protein[protein_key].annotations['domain_type'][domain_type] = dlen
    return protein.values()
def annotate_records(fasta):
    """Attach length of domain_type coverage over each protein in fasta file

    fasta -- path to a fasta file; each record id must convert with int().

    Sets annotations['domain_type'] on every record to a defaultdict
    (default 0) and stores in it, per domain type, the summed length
    reported by the domain query.  Returns the values of the
    id -> record mapping; when the file has no records the database is not
    queried at all.
    """
    with open(fasta) as handle:
        records = list(SeqIO.parse(handle, "fasta"))
    protein = {}
    for r in records:
        protein[int(r.id)] = r
        r.annotations['domain_type'] = defaultdict(int)
    if not protein:
        # Nothing to look up; an empty "in ()" clause would be invalid SQL.
        return protein.values()
    with hddb.connect("ddbCommon") as cursor:
        # Keys come from int(r.id), so they are plain integers.
        keys = ",".join([str(key) for key in protein.keys()])
        # NOTE(review): the grouping is on domain_type alone, while
        # parent_sequence_key is also selected -- verify this is intended.
        query = """select d.parent_sequence_key, d.domain_type, sum(length(sequence)) from domain d join sequence s on d.domain_sequence_key=s.id where parent_sequence_key in (%s) group by domain_type""" % keys
        debug(query)
        cursor.execute(query)
        for protein_key, domain_type, dlen in cursor.fetchall():
            record = protein[protein_key]
            debug(len(record.seq), domain_type, dlen)
            record.annotations['domain_type'][domain_type] = dlen
    return protein.values()
def parse(self, output):
    """
    Parse the Blast handle and append records to the output file

    output -- path of the Blast XML file to read with NCBIXML.parse.

    Hits pass when min/max of (align_length, query_letters) is not below
    self.align_perc and the expect value is not above self.expect.  The
    identifier of a hit is the first word of the alignment title with its
    first four characters removed.  Identifiers already stored in
    self.output are dropped, and the remaining ones are loaded with
    hddb.proteins and appended to self.output as fasta.
    """
    with open(output) as handle:
        runtime().debug("Parsing blast results")
        blast_records = NCBIXML.parse(handle)
        matches = set()
        for blast_record in blast_records:
            for alignment in blast_record.alignments:
                for hsp in alignment.hsps:
                    n = float(min(hsp.align_length, blast_record.query_letters))
                    m = float(max(hsp.align_length, blast_record.query_letters))
                    too_short = n / m < self.align_perc
                    if too_short:
                        continue
                    if hsp.expect > self.expect:
                        continue
                    identifier = alignment.title.split()[0][4:]
                    matches.add(identifier)
        runtime().debug("Found %i hits" % len(matches))

    output_exists = os.path.exists(self.output)
    if output_exists:
        with open(self.output) as fasta_file:
            already_found = set()
            for existing in SeqIO.parse(fasta_file, "fasta"):
                already_found.add(existing.id)
        debug("Found %i already" % len(matches.intersection(already_found)))
        matches = matches.difference(already_found)
    debug("Found %i new matches" % len(matches))

    if len(matches) > 0:
        with hddb.connect("ddbCommon") as cursor:
            records = list(hddb.proteins(cursor, sequence_key=matches))
        runtime().debug("Formatting blast results into fasta file")
        with open(self.output, "a") as fasta_file:
            SeqIO.write(records, fasta_file, "fasta")
def plot(families, filename="ginzu.svg"):
    """
    Draw a stacked horizontal bar chart of domain values per family.

    families -- list of (name, domains) pairs; `domains` is indexed by every
                entry of DOMAIN_TYPES and by 'psiblast'.  The list is sorted
                in place by the 'psiblast' value.
    filename -- path the figure is saved to.
    """
    #pylab.figure(figsize=(7,7))
    # key= gives the same ordering as the former cmp= comparison.
    families.sort(key=lambda item: item[1]['psiblast'])
    fp = font.FontProperties(size="x-small")
    pylab.axes([0.3, 0.0, .6, .7])
    ind = arange(len(families))
    width = .35
    plots = {}
    domain_dict = {}
    for dtype in DOMAIN_TYPES:
        domain_dict[dtype] = []
    names = []
    for name, domains in families:
        names.append(name)
        for dtype in DOMAIN_TYPES:
            domain_dict[dtype].append(domains[dtype])
    # Running left edge of each family's bar; grows as types are stacked.
    sum_bottom = [0] * len(families)
    debug("length of sum_bottom:", len(sum_bottom))
    for dkey in DOMAIN_TYPES:
        #plots[dkey] = pylab.bar(ind, domain_dict[dkey], color=self.color[dkey],bottom=sum_bottom)
        plots[dkey] = pylab.barh(ind,
                                 domain_dict[dkey],
                                 color=colors[dkey],
                                 left=sum_bottom)
        sum_bottom = [left + value
                      for left, value in zip(sum_bottom, domain_dict[dkey])]
    #pylab.xticks(ind+width/2, organisms_dict.keys(), rotation=45)
    pylab.yticks(ind + width / 2, names, size='xx-small')
    pylab.xticks()
    pylab.title("Ginzu domain frequencies")
    pylab.legend([plots[key][0] for key in DOMAIN_TYPES],
                 DOMAIN_TYPES,
                 prop=fp,
                 markerscale=.5,
                 loc=(.85, .85))
    #kdrew: a little filename manipulation
    #f_parts = self.yield_plot_filename.rpartition('.')
    #filename = f_parts[0] + "_" + self.experiment_key + "." + f_parts[2]
    pylab.savefig(filename)
def _psipred(self):
    """
    Run psipred on the sequence.

    Writes self.sequence.record to a fasta file under
    <scratch>/<prediction_code>/, runs blastpgp against "nr" to produce a
    checkpoint file, runs Psipred with it, stores the resulting prediction
    through self.session and returns self.psipred.
    """
    debug("Running psipred prediction")
    from hpf.seq import TemporaryRecordFile
    from hpf.pdb.psipred import Psipred, PsipredOptions
    from Bio import SeqIO

    work_dir = os.path.join(self.scratch, self.prediction_code)
    fasta = os.path.join(work_dir, self.prediction_code + ".fasta")
    with open(fasta, "w") as handle:
        SeqIO.write([self.sequence.record], handle, "fasta")
    psipred = self.psipred

    from Bio.Blast.NCBIStandalone import blastpgp
    chk = fasta + ".chk"
    import subprocess
    # Locate the blastpgp executable through `which`
    which = subprocess.Popen(["which", "blastpgp"], stdout=subprocess.PIPE)
    which_stdout = which.communicate()[0]
    cmd = which_stdout.strip()
    debug("Using", cmd)
    result, error = blastpgp(cmd,
                             "nr",
                             fasta,
                             npasses=3,
                             checkpoint_outfile=chk,
                             expectation=1e-4,
                             model_threshold=1e-4,
                             align_outfile="/dev/null")
    debug(result.readlines())
    debug(error.readlines())

    options = PsipredOptions(fasta,
                             profile=chk,
                             output=psipred + ".1",
                             output2=psipred + ".2",
                             horiz=psipred,
                             cwd=work_dir)
    prediction = Psipred(options).run()
    db_pred = PsipredFactory().create(prediction,
                                      sequence_key=self.sequence.id)
    self.session.add(db_pred)
    self.session.commit()
    return psipred
def run(self):
    """
    Export the decoy file and the psipred file into the scratch directory.

    The decoy path is built with format() from self.hpf1 when the outfile's
    executable_key is 179, otherwise from self.hpf2.  An existing decoy is
    copied to scratch unless a copy is already there; a missing psipred
    file is produced through self._ss().

    Returns (decoy, psipred) when both paths exist afterwards, else None.
    """
    from hpf.utilities.paths import ensure

    outfile = self.filesystem_outfile
    if outfile.executable_key == 179:
        runtime().debug("HPF1 format", self.filesystem_outfile)
        template = self.hpf1
    else:
        runtime().debug("HPF2 format", self.filesystem_outfile)
        template = self.hpf2
    debug("Using decoy format", template)
    self.decoy = format(template, self.prediction_code)

    if os.path.exists(self.decoy):
        self.destination = os.path.join(self.scratch,
                                        os.path.basename(self.decoy))
        if os.path.exists(self.destination):
            debug("exists", self.destination)
            self.decoy = self.destination
        else:
            ensure(os.path.join(self.scratch, self.prediction_code))
            self.decoy = self._decoy()
    else:
        # A missing decoy is only logged; the psipred step still runs.
        debug("NO DECOY FILE", self.prediction_code, self.decoy)

    self.psipred = os.path.join(self.scratch,
                                self.prediction_code,
                                self.prediction_code + ".psipred")
    if os.path.exists(self.psipred):
        debug("exists", self.psipred)
    else:
        ensure(os.path.join(self.scratch, self.prediction_code))
        self.psipred = self._ss()

    if not all(map(os.path.exists, [self.decoy, self.psipred])):
        runtime().debug("Failed to export")
        return None
    runtime().debug("Exported", (self.decoy, self.psipred))
    return (self.decoy, self.psipred)
def import_family(file):
    """
    Import the sequences of one family fasta file into the database.

    file -- fasta file path; the family name is derived from the base name
            (text before the last dot, underscores turned into spaces).

    The Family row is created if missing.  For each record the matching
    Sequence (by SHA-1 of the residues, '*' stripped), SequenceAc and
    Protein (experiment E_ID) rows are created when absent and the sequence
    is linked to the family.  One commit is issued at the end, and the
    session is closed whether or not the import succeeds.  Returns None.
    """
    base_name = os.path.basename(file)
    name = ".".join(base_name.split(".")[:-1]).replace("_", " ")
    debug(name, file)
    session = Session()
    try:
        family = session.query(Family).filter(Family.name == name).first()
        if family is None:
            family = Family()
            family.name = name
            family.experiment_key = E_ID
            session.add(family)
            session.flush()

        with open(file) as handle:
            for record in SeqIO.parse(handle, "fasta"):
                seq = str(record.seq).replace("*", "")
                well_formed = all(c in string.ascii_uppercase for c in seq)
                if not well_formed:
                    # Reported only; the record is imported regardless.
                    runtime().println("Malformed Sequence", Family, seq)
                sha1 = hashlib.sha1(seq).hexdigest()

                sequence = session.query(Sequence).filter(
                    Sequence.sha1 == sha1).first()
                if sequence is None:
                    sequence = Sequence()
                    sequence.sha1 = sha1
                    sequence.sequence = seq
                    session.add(sequence)
                    # Flushed so that sequence.id can be used right away.
                    session.flush()
                    debug("Added", sequence)

                sequenceAc = session.query(SequenceAc).filter(
                    and_(SequenceAc.sequence_key == sequence.id,
                         SequenceAc.ac == record.id)).first()
                if sequenceAc is None:
                    sequenceAc = SequenceAc()
                    sequenceAc.sequence_key = sequence.id
                    sequenceAc.gi = None
                    sequenceAc.db = "amnh"
                    sequenceAc.ac = record.id
                    sequenceAc.description = "amnh"
                    sequenceAc.taxonomy_id = 0
                    session.add(sequenceAc)
                    session.flush()

                protein = session.query(Protein).filter(
                    and_(Protein.sequence_key == sequence.id,
                         Protein.experiment_key == E_ID)).first()
                if protein is None:
                    protein = Protein()
                    protein.experiment_key = E_ID
                    protein.protein_type = "phylogeny"
                    protein.sequence_key = sequence.id
                    protein.probability = 0
                    protein.comment = "auto added amnh families"
                    protein.file_key = 0
                    protein.parse_key = 0
                    protein.gene_key = 0
                    session.add(protein)
                    session.flush()
                    debug("Added", protein)

                if family not in sequence.families:
                    sequence.families.append(family)
                    session.flush()
        session.commit()
    finally:
        # Always give the session back, including on failure.
        session.close()
    debug("closed")
    return None
def _blast(fasta):
    """
    Worker entry point: run thread_setup(), then blast() on `fasta`.

    The FORCE runtime option is forwarded as blast()'s `force` argument and
    a debug message is logged when it is set.
    """
    thread_setup()
    forcing = runtime().opt(FORCE)
    if forcing:
        debug("Forcing re-run")
    # The option is read again for the call itself.
    return blast(fasta, force=runtime().opt(FORCE))
def _blast(fasta):
    """
    Set up the worker with thread_setup() and return blast(fasta, ...).

    Logs "Forcing re-run" when the FORCE runtime option is set; the option
    value is passed to blast() as `force`.
    """
    thread_setup()
    if not runtime().opt(FORCE):
        pass
    else:
        debug("Forcing re-run")
    force_flag = runtime().opt(FORCE)
    return blast(fasta, force=force_flag)