Exemple #1
0
    def piechart(self, domain_type):
        """Save a pie chart of the mean coverage per domain type.

        For each entry of ``DOMAIN_TYPES`` the mean of ``domain_type[entry]``
        is used as the slice size (0.0 when that collection is empty).  The
        figure is titled with ``self.family_name`` and written to
        ``self.output[0]``.

        :param domain_type: mapping indexed by every ``DOMAIN_TYPES`` entry,
            each value being a sized collection of coverage numbers.
        """
        pie_file = self.output[0]
        pylab.figure(figsize=(5, 5))

        probs, labels = [], []
        for dtype in DOMAIN_TYPES:
            mean_coverage = mean(
                domain_type[dtype]) if len(domain_type[dtype]) > 0 else 0.0
            debug(self.family_name, dtype, mean_coverage)
            probs.append(mean_coverage)
            labels.append(dtype)

        # Slices are drawn without labels; the figure legend names them.
        patches, texts = pylab.pie(probs,
                                   explode=None,
                                   labels=None,
                                   shadow=False,
                                   colors=DOMAIN_COLORS)
        pylab.figlegend(patches,
                        labels,
                        "lower left",
                        fancybox=True,
                        markerscale=0.2)
        pylab.title(self.family_name)
        pylab.savefig(pie_file)
Exemple #2
0
def do_alignment(cl, parse=True):
    """Run a mammoth command line and, optionally, parse its result.

    :param cl: command-line object; ``str(cl)`` is executed through the
        shell and its ``tempd``, ``cwd`` and ``output`` attributes are read.
    :param parse: when false, return the raw ``communicate()`` result
        instead of parsing.
    :returns: the ``(stdout, stderr)`` tuple from ``communicate()`` when
        ``parse`` is false, otherwise a ``MammothAlignment`` whose ``_cwd``
        is the directory the command ran in.
    :raises subprocess.CalledProcessError: when parsing and the process
        exits with a non-zero code.
    """
    command = str(cl)
    # NOTE(review): the test is on ``cl.tempd`` but the fallback reads
    # ``cl.cwd`` -- confirm this asymmetry is intended.
    if cl.tempd is None:
        cwd = tempfile.mkdtemp()
    else:
        cwd = cl.cwd

    debug("Running mammoth", command)
    # Keep the devnull handle in a context manager so it is always closed.
    with open(os.devnull, "w") as devnull:
        p = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE,
                             stderr=devnull, cwd=cwd)
        if not parse:
            return p.communicate()

        # Now parse output
        if cl.output:
            # Consume the process' pipe so we don't block
            for _ in p.stdout:
                pass
            with open(cl.output) as parse_output:
                fields = tuple(_parse_mammoth(parse_output))
        else:
            fields = tuple(_parse_mammoth(p.stdout))
        # Wait for process to finish
        rcode = p.wait()

    if rcode != 0:
        # CalledProcessError takes (returncode, cmd), in that order.
        raise subprocess.CalledProcessError(rcode, command)

    (ini_psi, ini_rms, end_psi, end_rms, zscore, evalue,
     pred_seq, pred_ss, exp_seq, exp_ss) = fields
    mm = MammothAlignment(ini_rms=ini_rms, ini_psi=ini_psi, end_rms=end_rms,
                          end_psi=end_psi, zscore=zscore, evalue=evalue,
                          pred_seq=pred_seq, pred_ss=pred_ss,
                          exp_seq=exp_seq, exp_ss=exp_ss)
    mm._cwd = cwd
    return mm
Exemple #3
0
def thread_setup():
    """Initialise the module-level ``Session`` if it has not been set yet.

    When ``Session`` is ``None`` this calls ``clear()`` and ``setup()`` and
    stores the third value returned by ``setup()`` in ``Session``.  Later
    calls are no-ops.
    """
    global Session
    if Session is not None:
        return
    clear()
    # Only the third element returned by setup() is needed here.
    _, _, s = setup()
    debug("Entering subprocess setup", s, Session)
    Session = s
Exemple #4
0
 def parse(self,output):
     """
     Read Blast XML from ``output`` and append unseen hits to ``self.output``.

     An HSP qualifies when the ratio between the shorter and the longer of
     (alignment length, query length) is at least ``self.align_perc`` and
     its e-value does not exceed ``self.expect``.  Ids already present in
     the ``self.output`` fasta file are skipped.
     """
     with open(output) as handle:
         runtime().debug("Parsing blast results")
         blast_records = NCBIXML.parse(handle)
         hits = set()
         for blast_record in blast_records:
             for alignment in blast_record.alignments:
                 for hsp in alignment.hsps:
                     shorter = float(min(hsp.align_length,blast_record.query_letters))
                     longer = float(max(hsp.align_length,blast_record.query_letters))
                     if shorter/longer < self.align_perc or hsp.expect > self.expect:
                         continue
                     # First token of the title, minus its 4-character prefix.
                     hits.add(alignment.title.split()[0][4:])
     runtime().debug("Found %i hits" % len(hits))

     if os.path.exists(self.output):
         with open(self.output) as fasta_file:
             already_found = set(r.id for r in SeqIO.parse(fasta_file, "fasta"))
         debug("Found %i  already" % len(hits.intersection(already_found)))
         hits = hits.difference(already_found)

     debug("Found %i new matches"%len(hits))
     if not hits:
         return

     with hddb.connect("ddbCommon") as cursor:
         records = list(hddb.proteins(cursor, sequence_key=hits))
     runtime().debug("Formatting blast results into fasta file")
     with open(self.output,"a") as fasta_file:
         SeqIO.write(records, fasta_file, "fasta")
Exemple #5
0
def thread_setup():
    """Create the global ``Session`` on first use; do nothing afterwards.

    If ``Session`` is ``None``, ``clear()`` and ``setup()`` are called and
    the last of the three values returned by ``setup()`` becomes the new
    ``Session``.
    """
    global Session
    if Session is None:
        clear()
        # setup() returns three values; only the last one is kept.
        _, _, s = setup()
        debug("Entering subprocess setup", s, Session)
        Session = s
Exemple #6
0
def makeDirs(path):
    """Create ``path`` (and parents), tolerating an already-existing path.

    Raises a PathException carrying the os.error number when creation fails
    for any reason other than EEXIST.
    """
    from hpf.utilities import debug
    debug("Ensuring: %s" % path)
    try:
        os.makedirs(path)
    except OSError as err:
        if err.errno == errno.EEXIST:
            # Already there: nothing to do.
            return
        raise PathException("Cannot make dir", err.errno)
 def __enter__(self):
     """Open a session and load the outfile and sequence for this prediction.

     Ensures ``self.scratch`` exists, then sets ``self.session``,
     ``self.filesystem_outfile`` (first row matching
     ``self.prediction_code``) and ``self.sequence``.  Returns ``self``.
     """
     from hpf.utilities.paths import ensure
     ensure(self.scratch)
     self.session = Session()

     runtime().debug("Loading outfile")
     outfile_query = self.session.query(FilesystemOutfile).filter(
         FilesystemOutfile.prediction_code==self.prediction_code)
     self.filesystem_outfile = outfile_query.first()

     runtime().debug("Loading sequence")
     sequence_query = self.session.query(Sequence)
     self.sequence = sequence_query.get(self.filesystem_outfile.sequence_key)

     debug(self.prediction_code,self.filesystem_outfile,self.sequence)
     return self
Exemple #8
0
 def _do(self):
     """Blast and parse each record, logging and skipping any that fail.

     Every record from ``self.records()`` is blasted into a temporary file
     created in the current directory, which is then handed to
     ``self.parse``.  The temporary file is removed when its ``with`` block
     exits.
     """
     for i,fasta in enumerate(self.records()):
         try:
             with NamedTemporaryFile(prefix="blast",dir=os.getcwd()) as blast_out:
                 self.blast(fasta,blast_out.name)
                 self.parse(blast_out.name)
         except Exception as e:
             # Deliberate best effort: one bad record must not abort the run.
             debug("Failed to blast/parse %i record in %s" % (i,self.fasta))
             debug("\t",e)
             continue
    def _decoy(self):
        """
        Copy the decoy file at ``self.decoy`` to ``self.destination``.

        Asserts that the decoy file exists, copies it, and returns the
        destination path.
        """
        assert os.path.exists(self.decoy), self.decoy
        debug("Found",self.decoy)

        target = self.destination
        shutil.copy(self.decoy, target)
        debug("Copied to",target)
        return target
Exemple #10
0
def find(pattern, dir=None):
    """Searches a directory path and yields regular expression matches.

    :param pattern: regular expression searched for (``re.search``) in the
        absolute path of every file below ``dir``.
    :param dir: root directory to walk.  Defaults to the current working
        directory, resolved when iteration starts rather than when this
        module was imported.
    :returns: generator of absolute file paths that match ``pattern``.
    """
    import re
    if dir is None:
        dir = os.getcwd()
    regex = re.compile(pattern)

    for path, _dirnames, fnames in os.walk(dir):
        for fn in fnames:
            debug(fn)
            full_path = os.path.abspath(join(path, fn))
            if regex.search(full_path):
                yield full_path
Exemple #11
0
def blast(fasta, db="hpf_protein", force=False):
    """
    Map families to the database.

    NOTE: this function is currently disabled.  After creating and entering
    a working directory named after the fasta file, it raises an
    ``Exception`` on purpose; the mapping code below the ``raise`` is kept
    for reference and does not run.  The previous working directory is
    always restored.

    :param fasta: path to the family fasta file; its basename without the
        last extension names the working directory and, with underscores
        replaced by spaces, the family.
    :param db: unused by the current body.
    :param force: unused by the current body.
    """
    runtime().debug(fasta)
    fasta_path = os.path.abspath(fasta)
    base = ".".join(os.path.basename(fasta_path).split(".")[:-1])
    name = base.replace("_", " ")
    workdir = base
    # Create the directory directly instead of interpolating the name into
    # a shell command, which breaks on spaces and shell metacharacters.
    if not os.path.isdir(workdir):
        os.makedirs(workdir)
    runtime().debug(workdir, fasta, base)
    cwd = os.getcwd()
    os.chdir(workdir)

    try:
        raise Exception(
            "This has been modified like crazy, don't run as is, make sure this is correct"
        )
        #runtime().pushd(dir)
        #formatted_fasta = FormatFastaTask(file,base+".fasta").run(force=force)
        # Cluster the families representatives before blasting everything to HPF
        #cdhit_fasta = CdhitTask(formatted_fasta,formatted_fasta+".cdhit",identity=0.7,length=0.7).run()
        #alignment = MuscleTask(formatted_fasta, base+".aln", clwstrict=True).run(force=force)
        #alignment = FormatAlignmentTask(alignment,base+".alnf").run(force=force)
        #blast_xml, blast_chk = InputAlignmentBlastTask(formatted_fasta,alignment, db="hpf_protein").run(force=force)
        #blast_matches = BlastParseXMLTask(blast_xml, base+".hpf.fasta", 0.8, expect=1e-6).run(force=force)
        #blast_matches = BlastTask(cdhit_fasta,alignment,base+".hpf.fasta").run(force=force)
        #graphics = DomainGraphicsTask(base,blast_matches,base+".svg","","").run(force=force)

        blast_matches = base + ".hpf.fasta"

        session = Session()
        try:
            family = session.query(Family).filter(Family.name == name).one()
            debug(family)
            with open(blast_matches) as handle:
                for record in SeqIO.parse(handle, "fasta"):
                    sequence_key = int(record.id)
                    mapping = session.query(FamilySequence).filter(
                        and_(FamilySequence.family_key == family.id,
                             FamilySequence.sequence_key == sequence_key)).first()
                    if mapping is None:
                        mapping = FamilySequence()
                        mapping.family_key = family.id
                        mapping.sequence_key = sequence_key
                        debug(mapping)
                        session.add(mapping)

            session.commit()
        finally:
            # Release the session even if a query or the commit fails.
            session.close()

        #runtime().popd()
    finally:
        os.chdir(cwd)
Exemple #12
0
 def _do(self):
     """Run blast + parse for every record; failures are logged and skipped.

     Each record yielded by ``self.records()`` gets its own temporary
     output file in the current working directory, which is passed to
     ``self.blast`` and then ``self.parse``.
     """
     for i, fasta in enumerate(self.records()):
         try:
             with NamedTemporaryFile(prefix="blast",
                                     dir=os.getcwd()) as blast_out:
                 self.blast(fasta, blast_out.name)
                 self.parse(blast_out.name)
         except Exception as e:
             # Best effort by design: log the failure and move on.
             debug("Failed to blast/parse %i record in %s" %
                   (i, self.fasta))
             debug("\t", e)
             continue
 def _db(self):
     """
     Write the sequence's stored psipred prediction to ``self.psipred``.

     Returns the path that was written.
     """
     debug("Getting psipred from database")
     from hpf.pdb.psipred import PsipredWriter
     out_path = self.psipred
     with open(out_path,"w") as handle:
         writer = PsipredWriter()
         writer.write(handle,
                      self.sequence.psipred.psipred,
                      self.sequence.record)
     return out_path
Exemple #14
0
def blast(fasta,db="hpf_protein",force=False):
    """
    Map families to the database.

    NOTE: currently disabled on purpose.  The function creates and enters a
    working directory named after the fasta file and then raises an
    ``Exception``; the mapping code after the ``raise`` is retained for
    reference only.  The original working directory is restored on exit.

    :param fasta: family fasta file; the basename minus its last extension
        gives the working directory name and (underscores turned into
        spaces) the family name.
    :param db: not used by the current body.
    :param force: not used by the current body.
    """
    runtime().debug(fasta)
    fasta_path = os.path.abspath(fasta)
    base = ".".join(os.path.basename(fasta_path).split(".")[:-1])
    name = base.replace("_"," ")
    workdir = base
    # Avoid building a shell command from the file name; create the
    # directory through the os module instead.
    if not os.path.isdir(workdir):
        os.makedirs(workdir)
    runtime().debug(workdir,fasta,base)
    cwd = os.getcwd()
    os.chdir(workdir)

    try:
        raise Exception("This has been modified like crazy, don't run as is, make sure this is correct")
        #runtime().pushd(dir)
        #formatted_fasta = FormatFastaTask(file,base+".fasta").run(force=force)
        # Cluster the families representatives before blasting everything to HPF
        #cdhit_fasta = CdhitTask(formatted_fasta,formatted_fasta+".cdhit",identity=0.7,length=0.7).run()
        #alignment = MuscleTask(formatted_fasta, base+".aln", clwstrict=True).run(force=force)
        #alignment = FormatAlignmentTask(alignment,base+".alnf").run(force=force)
        #blast_xml, blast_chk = InputAlignmentBlastTask(formatted_fasta,alignment, db="hpf_protein").run(force=force)
        #blast_matches = BlastParseXMLTask(blast_xml, base+".hpf.fasta", 0.8, expect=1e-6).run(force=force)
        #blast_matches = BlastTask(cdhit_fasta,alignment,base+".hpf.fasta").run(force=force)
        #graphics = DomainGraphicsTask(base,blast_matches,base+".svg","","").run(force=force)

        blast_matches = base+".hpf.fasta"

        session = Session()
        try:
            family = session.query(Family).filter(Family.name==name).one()
            debug(family)
            with open(blast_matches) as handle:
                for record in SeqIO.parse(handle, "fasta"):
                    sequence_key = int(record.id)
                    mapping = session.query(FamilySequence).filter(and_(FamilySequence.family_key==family.id, FamilySequence.sequence_key==sequence_key)).first()
                    if mapping is None:
                        mapping = FamilySequence()
                        mapping.family_key = family.id
                        mapping.sequence_key = sequence_key
                        debug(mapping)
                        session.add(mapping)

            session.commit()
        finally:
            # Always give the session back, including on failure.
            session.close()

        #runtime().popd()
    finally:
        os.chdir(cwd)
Exemple #15
0
def import_family(file):
    """Import a family fasta file into the database.

    The family name is the file's basename without its last extension, with
    underscores replaced by spaces.  For every fasta record the function
    makes sure a ``Sequence`` (keyed by the SHA-1 of the residue string with
    ``*`` removed), a ``SequenceAc``, and a ``Protein`` row exist, creating
    the missing ones, and links the sequence to the family.

    :param file: path to the fasta file.
    :returns: ``None``.
    """
    name = ".".join(os.path.basename(file).split(".")[:-1]).replace("_"," ")
    debug(name, file)
    session = Session()
    try:
        family = session.query(Family).filter(Family.name==name).first()
        if family is None:
            family = Family()
            family.name = name
            family.experiment_key = E_ID
            session.add(family)
            session.flush()

        with open(file) as handle:
            for record in SeqIO.parse(handle,"fasta"):
                seq = str(record.seq).replace("*","")
                if not all(c in string.ascii_uppercase for c in seq):
                    # NOTE(review): this passes the ``Family`` class, not the
                    # ``family`` instance, and the record is still imported
                    # afterwards -- confirm both are intended.
                    runtime().println("Malformed Sequence", Family, seq)
                sha1 = hashlib.sha1(seq).hexdigest()
                sequence = session.query(Sequence).filter(Sequence.sha1==sha1).first()
                if sequence is None:
                    sequence = Sequence()
                    sequence.sha1 = sha1
                    sequence.sequence = seq
                    session.add(sequence)
                    # Flush so that sequence.id is populated for the rows below.
                    session.flush()
                    debug("Added",sequence)
                sequenceAc = session.query(SequenceAc).filter(and_(SequenceAc.sequence_key==sequence.id, SequenceAc.ac==record.id)).first()
                if sequenceAc is None:
                    sequenceAc = SequenceAc()
                    sequenceAc.sequence_key = sequence.id
                    sequenceAc.gi = None
                    sequenceAc.db = "amnh"
                    sequenceAc.ac = record.id
                    sequenceAc.description = "amnh"
                    sequenceAc.taxonomy_id = 0
                    session.add(sequenceAc)
                    session.flush()
                protein = session.query(Protein).filter(and_(Protein.sequence_key==sequence.id,Protein.experiment_key==E_ID)).first()
                if protein is None:
                    protein = Protein()
                    protein.experiment_key = E_ID
                    protein.protein_type = "phylogeny"
                    protein.sequence_key = sequence.id
                    protein.probability = 0
                    protein.comment = "auto added amnh families"
                    protein.file_key = 0
                    protein.parse_key = 0
                    protein.gene_key = 0
                    session.add(protein)
                    session.flush()
                    debug("Added",protein)
                if family not in sequence.families:
                    sequence.families.append(family)
                    session.flush()
        session.commit()
    finally:
        # Close the session on failure too, so it is never leaked.
        session.close()
    debug("closed")
    return None
Exemple #16
0
 def piechart(self, domain_type):
     """Save a pie chart of the mean coverage per domain type.

     Each ``DOMAIN_TYPES`` entry contributes one slice sized by the mean of
     ``domain_type[entry]`` (0.0 when that collection is empty).  The
     figure is titled with ``self.family_name`` and saved to
     ``self.output[0]``.

     :param domain_type: mapping indexed by every ``DOMAIN_TYPES`` entry,
         each value being a sized collection of coverage numbers.
     """
     pie_file = self.output[0]
     pylab.figure(figsize=(5,5))

     probs,labels = ([],[])
     for dtype in DOMAIN_TYPES:
         mean_coverage = mean(domain_type[dtype]) if len(domain_type[dtype])>0 else 0.0
         debug(self.family_name,dtype,mean_coverage)
         probs.append(mean_coverage)
         labels.append(dtype)

     # The slices carry no labels; the legend identifies them instead.
     patches, texts = pylab.pie(probs,explode=None,labels=None,shadow=False,colors=DOMAIN_COLORS)
     pylab.figlegend(patches, labels, "lower left", fancybox=True, markerscale=0.2)
     pylab.title(self.family_name)
     pylab.savefig(pie_file)
Exemple #17
0
def annotate_records(fasta):
    """Attach length of domain_type coverage over each protein in fasta file

    Every record gets ``annotations['domain_type']``, a mapping from domain
    type to the summed domain sequence length reported by the database for
    that protein (0 for types with no rows).

    :param fasta: path to a fasta file whose record ids are integer
        sequence keys.
    :returns: the annotated records (``protein.values()``).
    """
    with open(fasta) as handle:
        records = list(SeqIO.parse(handle, "fasta"))

    protein = {}
    for r in records:
        protein[int(r.id)] = r
        r.annotations['domain_type'] = defaultdict(int)

    if not protein:
        # An empty "in ()" list is not valid SQL; there is nothing to query.
        return protein.values()

    with hddb.connect("ddbCommon") as cursor:
        # Group by protein as well as domain type: grouping by domain_type
        # alone merges the sums of all proteins into a single row per type.
        # Keys are ints (see int(r.id) above), so interpolation is safe.
        query = """select d.parent_sequence_key, d.domain_type, sum(length(sequence)) 
        from domain d join sequence s on d.domain_sequence_key=s.id 
        where parent_sequence_key in (%s) 
        group by d.parent_sequence_key, d.domain_type""" % ",".join(
            [str(key) for key in protein.keys()])
        debug(query)
        cursor.execute(query)
        for protein_key, domain_type, dlen in cursor.fetchall():
            debug(len(protein[protein_key].seq), domain_type, dlen)
            protein[protein_key].annotations['domain_type'][domain_type] = dlen
    return protein.values()
Exemple #18
0
def annotate_records(fasta):
    """Attach length of domain_type coverage over each protein in fasta file

    Sets ``annotations['domain_type']`` on every record to a mapping from
    domain type to the summed domain sequence length the database reports
    for that protein; types without rows read as 0.

    :param fasta: path to a fasta file whose record ids are integer
        sequence keys.
    :returns: the annotated records (``protein.values()``).
    """
    with open(fasta) as handle:
        records = list(SeqIO.parse(handle, "fasta"))

    protein = {}
    for r in records:
        protein[int(r.id)] = r
        r.annotations['domain_type'] = defaultdict(int)

    if not protein:
        # Skip the query: "in ()" with no keys is a SQL syntax error.
        return protein.values()

    with hddb.connect("ddbCommon") as cursor:
        # The grouping must include the protein key; grouping by domain_type
        # only collapses every protein's rows into one sum per type.
        # Keys come from int(r.id), so string interpolation is safe here.
        query = """select d.parent_sequence_key, d.domain_type, sum(length(sequence)) 
        from domain d join sequence s on d.domain_sequence_key=s.id 
        where parent_sequence_key in (%s) 
        group by d.parent_sequence_key, d.domain_type""" % ",".join(
            [str(key) for key in protein.keys()])
        debug(query)
        cursor.execute(query)
        for protein_key, domain_type, dlen in cursor.fetchall():
            debug(len(protein[protein_key].seq),domain_type,dlen)
            protein[protein_key].annotations['domain_type'][domain_type] = dlen
    return protein.values()
Exemple #19
0
    def parse(self, output):
        """
        Read Blast XML from ``output`` and append unseen hits to ``self.output``.

        An HSP is kept when the shorter of (alignment length, query length)
        divided by the longer is at least ``self.align_perc`` and its
        e-value is at most ``self.expect``.  Ids already in the
        ``self.output`` fasta file are not written again.
        """
        with open(output) as handle:
            runtime().debug("Parsing blast results")
            blast_records = NCBIXML.parse(handle)
            hits = set()
            for blast_record in blast_records:
                for alignment in blast_record.alignments:
                    for hsp in alignment.hsps:
                        shorter = float(
                            min(hsp.align_length, blast_record.query_letters))
                        longer = float(
                            max(hsp.align_length, blast_record.query_letters))
                        too_short = shorter / longer < self.align_perc
                        if too_short or hsp.expect > self.expect:
                            continue
                        # First title token with its 4-character prefix removed.
                        hits.add(alignment.title.split()[0][4:])
        runtime().debug("Found %i hits" % len(hits))

        if os.path.exists(self.output):
            with open(self.output) as fasta_file:
                already_found = set(
                    r.id for r in SeqIO.parse(fasta_file, "fasta"))
            debug("Found %i  already" %
                  len(hits.intersection(already_found)))
            hits = hits.difference(already_found)

        debug("Found %i new matches" % len(hits))
        if not hits:
            return

        with hddb.connect("ddbCommon") as cursor:
            records = list(hddb.proteins(cursor, sequence_key=hits))
        runtime().debug("Formatting blast results into fasta file")
        with open(self.output, "a") as fasta_file:
            SeqIO.write(records, fasta_file, "fasta")
Exemple #20
0
def plot(families, filename="ginzu.svg"):
    """Draw a stacked horizontal bar chart of domain counts per family.

    :param families: list of ``(name, domains)`` pairs, where ``domains`` is
        indexable by every ``DOMAIN_TYPES`` entry and by ``'psiblast'``.
        The list is sorted in place by its ``'psiblast'`` value.
    :param filename: path the figure is saved to.
    """
    # A key function gives the same ordering as the old cmp= comparator and
    # is accepted by both Python 2 and Python 3.
    families.sort(key=lambda item: item[1]['psiblast'])

    fp = font.FontProperties(size="x-small")
    pylab.axes([0.3, 0.0, .6, .7])
    ind = arange(len(families))
    width = .35
    plots = {}
    domain_dict = dict((dtype, []) for dtype in DOMAIN_TYPES)

    names = []
    for name, domains in families:
        names.append(name)
        for dtype in DOMAIN_TYPES:
            domain_dict[dtype].append(domains[dtype])

    # Running left edge of each family's bar; every domain type is stacked
    # to the right of the previous ones.
    sum_bottom = [0] * len(families)
    debug("length of sum_bottom:", len(sum_bottom))
    for dkey in DOMAIN_TYPES:
        plots[dkey] = pylab.barh(ind, domain_dict[dkey], color=colors[dkey], left=sum_bottom)
        # Build a real list so that barh never receives a lazy iterator.
        sum_bottom = [left + value
                      for left, value in zip(sum_bottom, domain_dict[dkey])]

    pylab.yticks(ind+width/2, names, size='xx-small')
    pylab.xticks()
    pylab.title("Ginzu domain frequencies")

    pylab.legend([plots[key][0] for key in DOMAIN_TYPES], DOMAIN_TYPES, prop=fp, markerscale=.5, loc=(.85,.85))
    pylab.savefig(filename)
    def _psipred(self):
        """
        Run psipred on the sequence.

        Writes the sequence record to a fasta file under
        ``<scratch>/<prediction_code>/``, runs blastpgp against "nr" to
        produce a checkpoint file, runs Psipred with that checkpoint as its
        profile, stores the resulting prediction through ``self.session``
        and returns ``self.psipred`` (the path passed as the ``horiz``
        output).
        """
        debug("Running psipred prediction")
        # NOTE(review): TemporaryRecordFile is imported but not used below.
        from hpf.seq import TemporaryRecordFile
        from hpf.pdb.psipred import Psipred, PsipredOptions
        from Bio import SeqIO
        fasta = os.path.join(self.scratch,self.prediction_code,self.prediction_code+".fasta")
        with open(fasta,"w") as handle:
            SeqIO.write([self.sequence.record], handle, "fasta")
        psipred = self.psipred
        from Bio.Blast.NCBIStandalone import blastpgp
        # Checkpoint file written by blastpgp and later given to Psipred.
        chk = fasta+".chk"
        import subprocess
        # Resolve the blastpgp executable through the shell's `which`.
        cmd = subprocess.Popen(["which", "blastpgp"], stdout=subprocess.PIPE).communicate()[0].strip()
        debug("Using",cmd)
        # Three passes; alignment output is discarded, only the checkpoint is kept.
        result,error = blastpgp(cmd, 
                                "nr", 
                                fasta, 
                                npasses=3,
                                checkpoint_outfile=chk,
                                expectation=1e-4,
                                model_threshold=1e-4,
                                align_outfile="/dev/null")    
        debug(result.readlines())
        debug(error.readlines())

        options = PsipredOptions(fasta,
                                 profile=chk,
                                 output=psipred+".1",
                                 output2=psipred+".2",
                                 horiz=psipred,
                                 cwd = os.path.join(self.scratch,self.prediction_code))
        prediction = Psipred(options).run()
        # PsipredFactory is not imported in this method; it must be available
        # from the enclosing module.
        db_pred = PsipredFactory().create(prediction,sequence_key=self.sequence.id)
        self.session.add(db_pred)
        self.session.commit()
        #assert os.path.exists(psipred)
        return psipred
    def run(self):
        """Export the decoy and psipred files for this prediction to scratch.

        Picks the HPF1 path format when the outfile's ``executable_key`` is
        179 and the HPF2 format otherwise, copies the decoy file into
        ``self.scratch`` unless it is already there, and generates the
        psipred file through ``self._ss()`` when it is missing.

        Returns ``(self.decoy, self.psipred)`` when both paths exist,
        otherwise ``None``.
        """
        from hpf.utilities.paths import ensure

        if self.filesystem_outfile.executable_key==179:
            runtime().debug("HPF1 format",self.filesystem_outfile)
            _format = self.hpf1
        else:
            runtime().debug("HPF2 format",self.filesystem_outfile)
            _format = self.hpf2
        debug("Using decoy format",_format)

        # Decoy: reuse an existing scratch copy, otherwise copy it over.
        self.decoy = format(_format, self.prediction_code)
        if os.path.exists(self.decoy):
            self.destination = os.path.join(self.scratch,os.path.basename(self.decoy))
            if os.path.exists(self.destination):
                debug("exists",self.destination)
                self.decoy = self.destination
            else:
                ensure(os.path.join(self.scratch,self.prediction_code))
                self.decoy = self._decoy()
        else:
            debug("NO DECOY FILE",self.prediction_code,self.decoy)

        # Psipred: generate it only when it is not in scratch yet.
        self.psipred = os.path.join(self.scratch,self.prediction_code,self.prediction_code+".psipred")
        if os.path.exists(self.psipred):
            debug("exists",self.psipred)
        else:
            ensure(os.path.join(self.scratch,self.prediction_code))
            self.psipred = self._ss()

        for exported_path in [self.decoy,self.psipred]:
            if not os.path.exists(exported_path):
                runtime().debug("Failed to export")
                return None
        runtime().debug("Exported",(self.decoy,self.psipred))
        return (self.decoy,self.psipred)
Exemple #23
0
def import_family(file):
    """Import a family fasta file into the database.

    The family is named after the file's basename (last extension removed,
    underscores replaced by spaces).  For each fasta record this ensures a
    ``Sequence`` row (looked up by the SHA-1 of the residues with ``*``
    stripped), a ``SequenceAc`` row and a ``Protein`` row exist, creating
    whichever is missing, and adds the family to the sequence's families.

    :param file: path to the fasta file.
    :returns: ``None``.
    """
    name = ".".join(os.path.basename(file).split(".")[:-1]).replace("_", " ")
    debug(name, file)
    session = Session()
    try:
        family = session.query(Family).filter(Family.name == name).first()
        if family is None:
            family = Family()
            family.name = name
            family.experiment_key = E_ID
            session.add(family)
            session.flush()

        with open(file) as handle:
            for record in SeqIO.parse(handle, "fasta"):
                seq = str(record.seq).replace("*", "")
                if not all(c in string.ascii_uppercase for c in seq):
                    # NOTE(review): the ``Family`` class (not the ``family``
                    # instance) is printed, and the record is imported
                    # regardless -- confirm both are intended.
                    runtime().println("Malformed Sequence", Family, seq)
                sha1 = hashlib.sha1(seq).hexdigest()
                sequence = session.query(Sequence).filter(
                    Sequence.sha1 == sha1).first()
                if sequence is None:
                    sequence = Sequence()
                    sequence.sha1 = sha1
                    sequence.sequence = seq
                    session.add(sequence)
                    # Flush so sequence.id is available to the rows below.
                    session.flush()
                    debug("Added", sequence)
                sequenceAc = session.query(SequenceAc).filter(
                    and_(SequenceAc.sequence_key == sequence.id,
                         SequenceAc.ac == record.id)).first()
                if sequenceAc is None:
                    sequenceAc = SequenceAc()
                    sequenceAc.sequence_key = sequence.id
                    sequenceAc.gi = None
                    sequenceAc.db = "amnh"
                    sequenceAc.ac = record.id
                    sequenceAc.description = "amnh"
                    sequenceAc.taxonomy_id = 0
                    session.add(sequenceAc)
                    session.flush()
                protein = session.query(Protein).filter(
                    and_(Protein.sequence_key == sequence.id,
                         Protein.experiment_key == E_ID)).first()
                if protein is None:
                    protein = Protein()
                    protein.experiment_key = E_ID
                    protein.protein_type = "phylogeny"
                    protein.sequence_key = sequence.id
                    protein.probability = 0
                    protein.comment = "auto added amnh families"
                    protein.file_key = 0
                    protein.parse_key = 0
                    protein.gene_key = 0
                    session.add(protein)
                    session.flush()
                    debug("Added", protein)
                if family not in sequence.families:
                    sequence.families.append(family)
                    session.flush()
        session.commit()
    finally:
        # Release the session whether or not the import succeeded.
        session.close()
    debug("closed")
    return None
Exemple #24
0
def _blast(fasta):
    """Run ``blast`` on ``fasta`` after ``thread_setup()``.

    The ``FORCE`` runtime option is forwarded as ``force``.
    """
    thread_setup()
    if runtime().opt(FORCE):
        debug("Forcing re-run")
    force = runtime().opt(FORCE)
    return blast(fasta,force=force)
Exemple #25
0
def _blast(fasta):
    """Call ``thread_setup()``, then ``blast(fasta)`` with the ``FORCE`` option.

    Logs a message when the ``FORCE`` runtime option is set.
    """
    thread_setup()
    if runtime().opt(FORCE):
        debug("Forcing re-run")
    force_rerun = runtime().opt(FORCE)
    return blast(fasta, force=force_rerun)