コード例 #1
0
ファイル: oid.py プロジェクト: bsmithers/hpf
def import_dir(dir):
    """
    Import one OID family directory through OIDImporter.

    The family name is the base name of the absolute path of `dir`.  The
    alignment, cull-log and tree files must exist inside `dir`; the codeml
    file ("<family>_colcull.lrt") is optional and is passed as None when it
    is absent.

    @param dir: path of the family directory.
    @raise AssertionError: if `dir` is not a directory or one of the
        required files is missing (the missing path is the message).
    """
    assert os.path.isdir(dir)
    family_name = os.path.basename(os.path.abspath(dir))
    runtime().debug("Family Name", family_name)
    join = lambda *x: os.path.join(dir, *x)

    # Define the file names
    align_file = join("FAMILY.index")
    align_colcull_log = join("colcull.log")
    align_seqcull_log = join("seqcull.log")
    tree_file = join("oid.index.reroot")
    diagchars_file = join("diag.chars")

    codeml_file = join("%s_colcull.lrt" % family_name)
    # Report through the runtime debug channel instead of a bare print so
    # the message follows the debug setting and does not pollute stdout.
    runtime().debug(codeml_file, os.path.exists(codeml_file))
    codeml_file = codeml_file if os.path.exists(codeml_file) else None

    # Make sure all of the required files exist.  codeml_file is None when
    # absent and is skipped; diag.chars is not checked here.
    required = [tree_file, align_file, codeml_file, align_colcull_log, align_seqcull_log]
    for path in required:
        if path:
            assert os.path.exists(path), path
    runtime().debug(family_name, align_file, tree_file, codeml_file)

    # NOTE(review): oid_key is not defined in this function; it is resolved
    # from the enclosing module scope at call time.
    oid = OIDImporter(
        familyName=family_name,
        treeFile=tree_file,
        treeDiagCharsFile=diagchars_file,
        alignFile=align_file,
        alignColcullLog=align_colcull_log,
        alignSeqcullLog=align_seqcull_log,
        codemlFile=codeml_file,
        alignFormat="fasta",
        oid_key=oid_key,
    )
    oid.merge()
コード例 #2
0
ファイル: publications.py プロジェクト: bsmithers/hpf
def pubmed(gi,ids,query):
    """
    Yield a Citation for every PubMed article referenced by ids.

    Each id is fetched on its own with efetch and parsed with eread.  An
    error raised while parsing one id is logged and that id is skipped; the
    fetch handle is always closed.

    @param gi: value passed through as the first Citation argument.
    @param ids: iterable of PubMed ids to fetch.
    @param query: value passed through as the last Citation argument.
    @return: generator of Citation objects.
    """
    for article_id in ids:
        handle = efetch(db="pubmed",id=article_id,retmode='xml',rettype='xml',retmax=MAX_RETURN)
        try:
            results = eread(handle)
            for citation in results:
                citation = citation['MedlineCitation']
                pmid = citation['PMID']
                article = citation['Article']
                title = article['ArticleTitle']
                journal = article['Journal']['Title']
                try:
                    # Prefer the completion date, fall back to the creation date.
                    if 'DateCompleted' in citation:
                        date = citation['DateCompleted']
                    else:
                        date = citation['DateCreated']
                    published = "%s-%s-%s" % (date['Year'], date['Month'], date['Day'])
                except Exception:
                    # No usable date on the record: use the placeholder value.
                    published = '0000-00-00'

                runtime().debug("Parsed pmid:%s" % article_id)
                yield Citation(gi, pmid, title, journal, published, query)
        except Exception:
            # Deliberately not a bare "except:": the yield above sits inside
            # this try block, and a bare except would also trap GeneratorExit
            # (raised when the consumer closes the generator early) and
            # KeyboardInterrupt, then carry on with the next id.
            runtime().debug("Failure fetching pmid:%s" % article_id)
        finally:
            handle.close()
コード例 #3
0
ファイル: psipred.py プロジェクト: bsmithers/hpf
 def run(self, twopass=True):
     """
     Run psipred followed by psipass2 and return the parsed prediction.

     The weight files are looked up in the "data" directory that sits next
     to the "bin" directory containing the psipred executable found on the
     PATH.  Both commands run through the shell in self.options.cwd.

     @param twopass: not used by this method; kept so existing callers that
         pass it keep working.
     @return: the result of parse() applied to the psipass2 "horiz" output.
     @raise subprocess.CalledProcessError: if either command exits non-zero.
     """
     mtx = self.options._mtx()
     output = self.options.output
     psipred = subprocess.Popen("which psipred", stdout=subprocess.PIPE, shell=True).communicate()[0]
     # Cut the /bin/psipred off
     bin_dir = os.path.split(psipred)[0]
     root_dir = os.path.split(bin_dir)[0]
     data = os.path.join(root_dir,"data")

     if self.options.single:
         weights = ["weights_s.dat", "weights_s.dat2", "weights_s.dat3"]
     else:
         weights = ["weights.dat", "weights.dat2", "weights.dat3", "weights.dat4"]
     # Build the command on ONE line.  With shell=True every newline in the
     # string starts a new shell command, so a multi-line command would run
     # "psipred <mtx>" alone and then try to execute each weight file.
     weight_args = " ".join(["%s/%s" % (data, weight) for weight in weights])
     cmd = "psipred %s %s > %s" % (mtx, weight_args, output)
     runtime().debug(cmd)
     subprocess.check_call(cmd, shell=True, cwd=self.options.cwd)

     output2 = self.options.output2
     horiz = self.options.horiz
     cmd = """psipass2 %s/weights_p2.dat 1 1.0 1.0 %s %s > %s""" % (data, output2, output, horiz)
     runtime().debug(cmd)
     subprocess.check_call(cmd, shell=True, cwd=self.options.cwd)
     with open(self.options.horiz) as handle:
         pred = parse(handle)
     return pred
コード例 #4
0
ファイル: __init__.py プロジェクト: bsmithers/hpf
def proteins(cursor, experiment=None, filter_experiments=True, sequence_key=None):
    """
    Return the selected proteins as SeqRecord objects.

    @param cursor: DB-API cursor used to execute the query.
    @param experiment: one experiment id or an iterable of ids to restrict to.
    @param filter_experiments: when true, exclude experiments whose
        taxonomy_id is 0.
    @param sequence_key: one sequence id or an iterable of ids to restrict to.
    @return: generator of SeqRecord objects whose annotations carry
        "taxonomy_id", "experiment_key" and "organism".
    @raise AssertionError: if both experiment and sequence_key are None.
    @raise ValueError: if no filter clause can be built from the arguments.
    """
    query = """SELECT s.id,s.sequence, e.id, e.short_name, e.taxonomy_id
        from hpf.experiment e 
        join bddb.protein p on e.id=p.experiment_key
        join ddbCommon.sequence s on p.sequence_key=s.id 
        """
    assert experiment is not None or sequence_key is not None

    # Collect the conditions first and join them once, so that "where" and
    # "and" are emitted exactly when needed.
    # NOTE(review): ids are interpolated into the SQL text with str(); they
    # are expected to be trusted numeric keys.
    clauses = []
    if experiment:
        if not hasattr(experiment, "__iter__"):
            experiment = [experiment]
        clauses.append("e.id in (%s)" % ",".join([str(key) for key in experiment]))
    if filter_experiments:
        clauses.append("e.taxonomy_id!=0")
    if sequence_key:
        # Accept a single key as well, mirroring the handling of experiment.
        if not hasattr(sequence_key, "__iter__"):
            sequence_key = [sequence_key]
        clauses.append("s.id in (%s)" % ",".join([str(key) for key in sequence_key]))
    if not clauses:
        # Only reachable with empty/zero filters and filter_experiments off;
        # fail clearly instead of sending a dangling "where" to the server.
        raise ValueError("no usable filter: experiment and sequence_key are both empty")
    query += " where " + " and ".join(clauses)

    runtime().debug(query)
    cursor.execute(query)
    runtime().debug("Fetching")
    for id, sequence, e_id, e_name, taxonomy_id in cursor.fetchall():
        record = SeqRecord(Seq(sequence), str(id), description=e_name)
        record.annotations = {"taxonomy_id":taxonomy_id,
                              "experiment_key":e_id,
                              "organism":e_name}
        yield record
コード例 #5
0
ファイル: psipred.py プロジェクト: bsmithers/hpf
    def _mtx(self, name=None):
        """
        Create the "<name>.mtx" matrix file and return its file name.

        Shell recipe this method reproduces:

            echo $tmproot.chk > $tmproot.pn
            echo $tmproot.fasta > $tmproot.sn
            $ncbidir/makemat -P $tmproot

        or, for a single sequence:

            $execdir/seq2mtx $1 > $tmproot.mtx

        @param name: base name for the generated files; defaults to the
            profile's file name without its last extension.
        @return: the matrix file name, "<name>.mtx".
        """
        if not name:
            stem_parts = os.path.basename(self.profile).split(".")[:-1]
            name = ".".join(stem_parts)
        mtx = "%s.mtx" % name

        if not self.single:
            # Profile mode: write the .pn/.sn pointer files, then run makemat.
            pointer_files = ((self.profile, name + ".pn"), (self.fasta, name + ".sn"))
            for source, pointer in pointer_files:
                echo_cmd = "echo %s > %s" % (source, pointer)
                subprocess.check_call(echo_cmd, shell=True, cwd=self.cwd)
            which = subprocess.Popen("which makemat", shell=True, stdout=subprocess.PIPE)
            makemat = which.communicate()[0].strip()
            cmd = "%s -P %s" % (makemat, name)
        else:
            # Single-sequence mode: derive the matrix from the fasta alone.
            cmd = "seq2mtx %s > %s" % (self.fasta, mtx)

        runtime().debug(cmd)
        subprocess.check_call(cmd, shell=True, cwd=self.cwd)
        return mtx
コード例 #6
0
 def parse(self, handle):
     """
     Yield a PositiveSelection for each site row of the Model 8 section.

     Lines are ignored until one starts with "Positively Selected Sites :
     Model 8"; parsing stops at the first line starting with "Common".
     Only rows with exactly six whitespace-separated fields whose first
     field is all digits are used.

     @param handle: iterable of text lines from the codeml output.
     @return: generator of PositiveSelection objects.
     """
     start = False
     for line in handle:
         if line.strip().startswith("Positively Selected Sites : Model 8"):
             start = True
             continue
         if not start:
             continue
         if line.startswith("Common"):
             break
         pieces = line.strip().split()
         if len(pieces) != 6 or not pieces[0].isdigit():
             continue
         runtime().debug("codeml", line)
         column, aa, pr, pm, _pm_, se = pieces
         # Indices start at 1 in CodeML, convert to 0.  The field is text, so
         # it must be converted before the subtraction ("12" - 1 is a
         # TypeError).
         column = int(column) - 1
         assert column >= 0
         # Drop the "*" markers appended to the probability.
         pr = pr.split("*")[0]
         original_column = self.mapper[column]
         from hpf.hddb.db import PositiveSelection
         ps = PositiveSelection(codeml_key=self.codeml.id,
                                column=original_column,
                                probability=pr,
                                post_mean=pm,
                                stderr=se)
         yield ps
コード例 #7
0
def prep(prediction_code):
    """
    Run PrepPredictionCode for one prediction code and return its result.

    The scratch and decoy locations are read from the runtime options.
    """
    runtime().debug("Running on",prediction_code)
    scratch = runtime().opt(SCRATCH)
    hpf1_decoys = runtime().opt(HPF1_DECOYS)
    hpf2_decoys = runtime().opt(HPF2_DECOYS)
    preparer = PrepPredictionCode(prediction_code, scratch, hpf1_decoys, hpf2_decoys)
    with preparer as task:
        return task()
コード例 #8
0
def main(file, fam):
    """
    Read phylip records from file, index them and merge each into the session.

    Sets the module globals `family` and `session`, which merge() reads.

    @param file: path of the phylip file to read.
    @param fam: family id; converted with int().
    """
    global family, session
    session = Session()
    try:
        family = int(fam)

        runtime().debug("Using file", file)
        with open(file) as handle:
            from hpf.amnh.oid import phylip
            records = phylip(handle)._records

        runtime().debug("Found %i records" % len(records))
        from hpf.amnh.oid import index
        index(records)
        # merge() is called for its side effect, so use an explicit loop
        # rather than map(), which is lazy on newer interpreters.
        for record in records:
            merge(record)
        session.commit()
    finally:
        # Always release the session, also when reading or merging fails.
        session.close()
コード例 #9
0
ファイル: publications.py プロジェクト: bsmithers/hpf
def names(gi):
    """
    Collect the synonyms stored for a protein gi in the refseq_* tables.

    Each listed table is queried for rows whose "protgi" column equals gi;
    the second column of every row is added (as a string) unless it is NULL.

    @param gi: protein gi; formatted with %i into the query.
    @return: set of synonym strings.
    """
    columns = ["protgi","gi"]
    # (table suffix, index into columns).  The bind, geneid and kegg tables
    # are intentionally not queried.
    tables = [("accession",0),
              ("codedby",0),
              ("genbank",0),
              ("genename",0),
              ("locustag",0),
              ("pdb",0),
              ("pfam",0),
              ("unigene",0),
              ("uniprotkb",0)]
    query = "select * from %s where %s=%i"
    found = set()
    # NOTE(review): the connection credentials are hard-coded in the source;
    # consider moving them to configuration.
    # Use an explicit cursor and close the connection ourselves: using the
    # connection as a context manager that yields a cursor is legacy MySQLdb
    # behaviour and it never closes the connection.
    connection = MySQLdb.connect(db="synonyms3",host="err.bio.nyu.edu",user="******",passwd="patrick_nyu")
    try:
        cursor = connection.cursor()
        for name, col in tables:
            table = "refseq_%s" % name
            column = columns[col]
            q = query % (table,column,gi)
            runtime().debug(q)
            cursor.execute(q)
            for row in cursor.fetchall():
                if row[1] is not None:
                    found.add(str(row[1]))
    finally:
        connection.close()
    return found
コード例 #10
0
def import_dir(dir):
    """
    Import one OID family directory through OIDImporter.

    The family name is the base name of the absolute path of `dir`.  The
    alignment, cull-log and tree files must exist inside `dir`; the codeml
    file ("<family>_colcull.lrt") is optional and is passed as None when it
    is absent.

    @param dir: path of the family directory.
    @raise AssertionError: if `dir` is not a directory or one of the
        required files is missing (the missing path is the message).
    """
    assert os.path.isdir(dir)
    family_name = os.path.basename(os.path.abspath(dir))
    runtime().debug("Family Name", family_name)
    join = lambda *x: os.path.join(dir, *x)

    # Define the file names
    align_file = join("FAMILY.index")
    align_colcull_log = join("colcull.log")
    align_seqcull_log = join("seqcull.log")
    tree_file = join("oid.index.reroot")
    diagchars_file = join("diag.chars")

    codeml_file = join("%s_colcull.lrt" % family_name)
    # Report through the runtime debug channel instead of a bare print so
    # the message follows the debug setting and does not pollute stdout.
    runtime().debug(codeml_file, os.path.exists(codeml_file))
    codeml_file = codeml_file if os.path.exists(codeml_file) else None

    # Make sure all of the required files exist.  codeml_file is None when
    # absent and is skipped; diag.chars is not checked here.
    required = [
        tree_file, align_file, codeml_file, align_colcull_log,
        align_seqcull_log
    ]
    for path in required:
        if path:
            assert os.path.exists(path), path
    runtime().debug(family_name, align_file, tree_file, codeml_file)

    # NOTE(review): oid_key is not defined in this function; it is resolved
    # from the enclosing module scope at call time.
    oid = OIDImporter(familyName=family_name,
                      treeFile=tree_file,
                      treeDiagCharsFile=diagchars_file,
                      alignFile=align_file,
                      alignColcullLog=align_colcull_log,
                      alignSeqcullLog=align_seqcull_log,
                      codemlFile=codeml_file,
                      alignFormat="fasta",
                      oid_key=oid_key)
    oid.merge()
コード例 #11
0
ファイル: psipred.py プロジェクト: bsmithers/hpf
 def run(self, ):
     """
     Run psipred and then psipass2, returning the parsed "horiz" output.

     Reference commands from the runpsipred scripts:
       psipred $tmproot.mtx $datadir/weights.dat $datadir/weights.dat2 $datadir/weights.dat3 > $rootname.ss
       psipass2 $datadir/weights_p2.dat 1 1.0 1.0 $rootname.ss2 $rootname.ss > $rootname.horiz
     """
     mtx = self.options._mtx()
     output = self.options.output

     # Locate <root>/data from the psipred executable at <root>/bin/psipred.
     which = subprocess.Popen("which psipred", stdout=subprocess.PIPE, shell=True)
     executable = which.communicate()[0]
     root_dir = os.path.split(os.path.split(executable)[0])[0]
     data = os.path.join(root_dir,"data")

     first_pass = "psipred %s %s/weights.dat %s/weights.dat2 %s/weights.dat3 > %s" % (
         mtx, data, data, data, output)
     runtime().debug(first_pass)
     subprocess.check_call(first_pass, shell=True, cwd=self.options.cwd)

     output2 = self.options.output2
     horiz = self.options.horiz
     second_pass = "psipass2 %s/weights_p2.dat 1 1.0 1.0 %s %s > %s" % (
         data, output2, output, horiz)
     runtime().debug(second_pass)
     subprocess.check_call(second_pass, shell=True, cwd=self.options.cwd)

     # The horiz output format is the same for Psipred 2.5 and 3.2.
     with open(self.options.horiz) as handle:
         pred = parse(handle)
     return pred
コード例 #12
0
ファイル: oid.py プロジェクト: bsmithers/hpf
    def parse(self, handle):
        """
        Yield a PositiveSelection for each site row of the Model 8 section.

        Lines are ignored until one starts with "Positively Selected Sites :
        Model 8"; parsing stops at the first line starting with "Common".
        Only rows with exactly six whitespace-separated fields whose first
        field is all digits are used.

        @param handle: iterable of text lines from the codeml output.
        @return: generator of PositiveSelection objects.
        """
        start = False
        for line in handle:
            if line.strip().startswith("Positively Selected Sites : Model 8"):
                start = True
                continue
            if not start:
                continue
            if line.startswith("Common"):
                break
            pieces = line.strip().split()
            if len(pieces) != 6 or not pieces[0].isdigit():
                continue
            runtime().debug("codeml", line)
            column, aa, pr, pm, _pm_, se = pieces
            # Indices start at 1 in CodeML, convert to 0.  The field is text,
            # so it must be converted before the subtraction ("12" - 1 is a
            # TypeError).
            column = int(column) - 1
            assert column >= 0
            # Drop the "*" markers appended to the probability.
            pr = pr.split("*")[0]
            original_column = self.mapper[column]
            from hpf.hddb.db import PositiveSelection

            ps = PositiveSelection(
                codeml_key=self.codeml.id, column=original_column, probability=pr, post_mean=pm, stderr=se
            )
            yield ps
コード例 #13
0
ファイル: paml.py プロジェクト: bsmithers/hpf
 def run(self):
     """
     Run codeml on this object's control file and parse the "rst" output.

     codeml runs through the shell in self.dir with stdout discarded.

     @return: the result of CodeMLParser().parse() on <self.dir>/rst.
     @raise subprocess.CalledProcessError: if codeml exits non-zero.
     """
     import subprocess,os
     with self.ctl as ctl_file:
         cmd ="codeml "+ctl_file
         runtime().debug(cmd)
         # Open the null device in a with-block so the handle is closed
         # once codeml has finished instead of being leaked.
         with open(os.devnull,"w") as devnull:
             subprocess.check_call(cmd,shell=True,cwd=self.dir,stdout=devnull)
     with open(os.path.join(self.dir,"rst")) as handle:
         # NOTE(review): if CodeMLParser.parse is a lazy generator, the file
         # is already closed when the caller iterates - confirm.
         return CodeMLParser().parse(handle)
コード例 #14
0
ファイル: publications.py プロジェクト: bsmithers/hpf
def tasks():
    """
    Return the distinct gi values selected by the domain_sccs query.

    @return: list holding the first column of every result row.
    """
    query = "select distinct gi from domain_sccs d join sequenceAc a on d.parent_sequence_key=a.sequence_key where ac is not NULL and d.sccs is not NULL and d.domain_type !='psiblast'"
    # NOTE(review): the password is hard-coded in the source; consider moving
    # it to configuration.
    # Use an explicit cursor and close the connection ourselves: using the
    # connection as a context manager that yields a cursor is legacy MySQLdb
    # behaviour and it never closes the connection.
    connection = MySQLdb.connect(host="127.0.0.1",passwd="patrick_nyu",db="hpf")
    try:
        cursor = connection.cursor()
        runtime().debug(query)
        cursor.execute(query)
        runtime().debug("Fetching")
        rows = cursor.fetchall()
    finally:
        connection.close()
    return [row[0] for row in rows]
コード例 #15
0
def tasks(*args):
    """
    Return the positional arguments as the task tuple.

    @raise AssertionError: if called without arguments (there is no default
        task discovery).
    """
    if len(args) == 0:
        # Raise explicitly: a bare "assert False" is removed under "python -O"
        # and the function would then fail later with UnboundLocalError.
        raise AssertionError("tasks() requires at least one argument")
    runtime().debug("TASKS", args)
    return args
コード例 #16
0
ファイル: map_families.py プロジェクト: dpenfoldbrown/hpf
def tasks(*args):
    """
    Return the positional arguments as the task tuple.

    @raise AssertionError: if called without arguments (there is no default
        task discovery).
    """
    if len(args) == 0:
        # Raise explicitly: a bare "assert False" is removed under "python -O"
        # and the function would then fail later with UnboundLocalError.
        raise AssertionError("tasks() requires at least one argument")
    runtime().debug("TASKS",args)
    return args
コード例 #17
0
ファイル: _import_family.py プロジェクト: bsmithers/hpf
def import_family(file):
    """
    Import a family fasta file: family, sequences, accessions and proteins.

    The family name is the file's base name without its last extension,
    with underscores replaced by spaces.  Rows that already exist (family by
    name, sequence by sha1, accession by sequence and ac, protein by
    sequence and experiment) are reused rather than duplicated.

    @param file: path of the fasta file.
    @return: None
    """
    name = ".".join(os.path.basename(file).split(".")[:-1]).replace("_"," ")
    debug(name, file)
    session = Session()
    try:
        family = session.query(Family).filter(Family.name==name).first()
        if family is None:
            family = Family()
            family.name = name
            family.experiment_key = E_ID
            session.add(family)
            session.flush()

        with open(file) as handle:
            for record in SeqIO.parse(handle,"fasta"):
                # Stop codons ("*") are removed before hashing and storing.
                seq = str(record.seq).replace("*","")
                if not all(c in string.ascii_uppercase for c in seq):
                    # Report the family being imported (the instance), not the
                    # Family class.  The sequence is still imported.
                    runtime().println("Malformed Sequence", family, seq)
                sha1 = hashlib.sha1(seq).hexdigest()
                sequence = session.query(Sequence).filter(Sequence.sha1==sha1).first()
                if sequence is None:
                    sequence = Sequence()
                    sequence.sha1 = sha1
                    sequence.sequence=seq
                    session.add(sequence)
                    # Flush so sequence.id is available below.
                    session.flush()
                    debug("Added",sequence)
                sequenceAc = session.query(SequenceAc).filter(and_(SequenceAc.sequence_key==sequence.id, SequenceAc.ac==record.id)).first()
                if sequenceAc is None:
                    sequenceAc = SequenceAc()
                    sequenceAc.sequence_key=sequence.id
                    sequenceAc.gi = None
                    sequenceAc.db = "amnh"
                    sequenceAc.ac = record.id
                    sequenceAc.description = "amnh"
                    sequenceAc.taxonomy_id = 0
                    session.add(sequenceAc)
                    session.flush()
                protein = session.query(Protein).filter(and_(Protein.sequence_key==sequence.id,Protein.experiment_key==E_ID)).first()
                if protein is None:
                    protein=Protein()
                    protein.experiment_key=E_ID
                    protein.protein_type="phylogeny"
                    protein.sequence_key = sequence.id
                    protein.probability = 0
                    protein.comment = "auto added amnh families"
                    protein.file_key = 0
                    protein.parse_key = 0
                    protein.gene_key = 0
                    session.add(protein)
                    session.flush()
                    debug("Added",protein)
                if not family in sequence.families:
                    sequence.families.append(family)
                    session.flush()
        session.commit()
    finally:
        # Release the session even when the import fails part-way.
        session.close()
    debug("closed")
    return None
コード例 #18
0
def main(*args):
    """
    Run prep over the generated tasks and pickle the non-None results.

    The results are written to "exported.pickle" in the working directory.
    """
    runtime().set_debug(1)
    synchronous = runtime().opt(SYNCHRONOUS)
    pool = processor(synchronous=synchronous, raise_errors=False)
    runtime().debug("Using processor",pool)
    pool.make_tasks(tasks)

    # Keep only the tasks for which prep produced a result.
    mcm_tasks = []
    for result in pool.run(prep):
        if result !=None:
            mcm_tasks.append(result)

    import cPickle
    with open("exported.pickle","w") as handle:
        cPickle.dump(mcm_tasks,handle)
コード例 #19
0
ファイル: template.py プロジェクト: bsmithers/hpf
def main(*args):
    """
    Template entry point: build a processor pool and drain its results.

    processor() is called with the keyword argument 'synchronous', whose
    value is the SYNCHRONOUS option of the runtime object (runtime() creates
    a new Runtime object or returns the existing one).  processor returns a
    Map, SGEArray, or PBSArrayProcessor object.
    NOTE: currently, PBSArrayProcessor is not fully implemented.
    """
    use_synchronous = runtime().opt(SYNCHRONOUS)
    pool = processor(synchronous=use_synchronous)
    runtime().debug("Using processor",pool)
    pool.make_tasks(None)
    results = pool.run(None)
    consume(results)
コード例 #20
0
ファイル: processing.py プロジェクト: bsmithers/hpf
 def find_num_processors():
     """Return the processor count reported by numpy's cpuinfo, for multiprocessing."""
     from numpy.distutils import cpuinfo
     info = cpuinfo.cpu.info
     if not isinstance(info, list):
         # Non-list form: read the count from the sysctl "hw.availcpu" entry.
         num_procs = int(info['sysctl_hw']['hw.availcpu'])
     else:
         # List form: one entry per processor.
         num_procs = len(info)
     runtime().debug("Auto-discovered %i processors" % num_procs)
     return num_procs
コード例 #21
0
ファイル: __init__.py プロジェクト: bsmithers/hpf
def ginzu_svg(sequence_key, width=800):
    """
    Return the vector graphic for GINZU domains.

    Runs the "svg.pl" perl script from SCRIPTS_FOLDER and returns what it
    writes to stdout, followed by a newline.

    @param sequence_key: sequence id, formatted with %i.
    @param width: width argument passed to the script, formatted with %i.
    @return: the script output plus a trailing "\n".
    """
    import subprocess
    cmd = "perl %s %i %i" % (os.path.join(SCRIPTS_FOLDER,"svg.pl"), sequence_key, width)
    # The command is reported through the debug channel only; the extra bare
    # print was removed so nothing but the SVG reaches stdout.
    runtime().debug(cmd)
    svg = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True).communicate()[0]+"\n"
    return svg
コード例 #22
0
ファイル: muscle.py プロジェクト: bsmithers/hpf
    def _do(self):
        """
        Run Muscle on self.fasta and return the resulting alignment.

        @return: self.output, or the result of clustal_to_phylip(self.output)
            when self.phylip is set.
        """
        options = MuscleOptions(input=self.fasta, output=self.output, maxhours=1, **self.kwargs)
        muscle = Muscle(options)
        runtime().debug("Performing Muscle w/", self.fasta)
        muscle.run()
        if not self.phylip:
            return self.output

        from hpf.phylip import clustal_to_phylip

        return clustal_to_phylip(self.output)
コード例 #23
0
ファイル: mcm.py プロジェクト: bsmithers/hpf
def tasks(decoy_dir, psipred_dir):
    """
    Yield (decoy, psipred) pairs for decoys that have a psipred file.

    Decoy files are located with find() under decoy_dir; the psipred file is
    expected at <psipred_dir>/<code>/<code>.psipred, where <code> is
    pred_code(decoy).  Decoys without that file are skipped.

    @param decoy_dir: directory searched for decoy archives.
    @param psipred_dir: directory holding one sub-directory per code.
    @return: generator of (decoy path, psipred path) tuples.
    """
    from hpf.utilities.paths import find
    runtime().debug("Searching for decoys",decoy_dir)
    # Raw string: "\." is a regex escape, not a Python string escape.
    decoys = list(find(r"[a-z]{2}[0-9]{3}\.((result)|(out))\.gz",dir=decoy_dir))
    runtime().debug("Found",len(decoys),"decoys")
    for decoy in decoys:
        prediction_code = pred_code(decoy)
        psipred = os.path.join(psipred_dir,prediction_code,prediction_code+".psipred")
        if os.path.exists(psipred):
            yield (decoy,psipred)
コード例 #24
0
 def __enter__(self):
     """
     Ensure the scratch path, open a session and load outfile and sequence.

     @return: self
     """
     from hpf.utilities.paths import ensure
     ensure(self.scratch)
     self.session = Session()

     runtime().debug("Loading outfile")
     outfile_query = self.session.query(FilesystemOutfile)
     outfile_query = outfile_query.filter(FilesystemOutfile.prediction_code==self.prediction_code)
     self.filesystem_outfile = outfile_query.first()

     runtime().debug("Loading sequence")
     sequence_query = self.session.query(Sequence)
     self.sequence = sequence_query.get(self.filesystem_outfile.sequence_key)

     debug(self.prediction_code,self.filesystem_outfile,self.sequence)
     return self
コード例 #25
0
def main(*files):
    """
    Rewrite each fasta file to "<file>.hpf" with every record passed through rename.

    Sets the module global `session`.
    """
    global session
    session = Session()
    for path in files:
        runtime().debug("Mapping file",path)
        with open(path) as source:
            records = list(SeqIO.parse(source,"fasta"))
        runtime().debug("Found %i records" % len(records))
        target_path = path+".hpf"
        with open(target_path,"w") as target:
            SeqIO.write(map(rename,records), target, "fasta")
0
ファイル: __init__.py プロジェクト: bsmithers/hpf
def tunnel(sleep=None):
    """
    Ensure a tunnel to the UWashington mysql server is running.

    Runs the TUNNEL shell command with its stdout and stderr discarded.

    @param sleep: when not None, first sleep a random time between 0 and
        `sleep` seconds.
    """
    from scripts import TUNNEL
    if sleep is not None:
        import time, random
        time.sleep(random.random()*sleep)
    runtime().debug(TUNNEL)
    import subprocess
    # The null device must be opened for writing: it is handed to the child
    # as stdout/stderr, and a read-only descriptor makes its writes fail.
    with open(os.devnull, "w") as handle:
        subprocess.call(TUNNEL,shell=True,stdout=handle,stderr=handle)
コード例 #27
0
def blast(fasta, db="hpf_protein", force=False):
    """
    Map families to the database.

    NOTE: this function is currently disabled.  It creates a directory named
    after the fasta file's base name, changes into it, and then raises an
    Exception unconditionally; the working directory is restored on the way
    out.  Everything after the raise is unreachable and is kept as a
    reference for the intended mapping step.

    @param fasta: path of the family fasta file.
    @param db: not referenced by the live code (only by commented-out lines).
    @param force: not referenced by the live code (only by commented-out lines).
    @raise Exception: always.
    """
    runtime().debug(fasta)
    file = os.path.abspath(fasta)
    # Base name without the last extension; the family name replaces "_" by " ".
    base = ".".join(os.path.basename(file).split(".")[:-1])
    name = base.replace("_", " ")
    dir = base
    subprocess.call("mkdir -p %s" % base, shell=True)
    runtime().debug(dir, fasta, base)
    # Remember the starting directory so the finally clause can restore it.
    cwd = os.getcwd()
    os.chdir(dir)

    try:
        # Deliberate guard: nothing below this statement is executed.
        raise Exception(
            "This has been modified like crazy, don't run as is, make sure this is correct"
        )
        #runtime().pushd(dir)
        #formatted_fasta = FormatFastaTask(file,base+".fasta").run(force=force)
        # Cluster the families representatives before blasting everything to HPF
        #cdhit_fasta = CdhitTask(formatted_fasta,formatted_fasta+".cdhit",identity=0.7,length=0.7).run()
        #alignment = MuscleTask(formatted_fasta, base+".aln", clwstrict=True).run(force=force)
        #alignment = FormatAlignmentTask(alignment,base+".alnf").run(force=force)
        #blast_xml, blast_chk = InputAlignmentBlastTask(formatted_fasta,alignment, db="hpf_protein").run(force=force)
        #blast_matches = BlastParseXMLTask(blast_xml, base+".hpf.fasta", 0.8, expect=1e-6).run(force=force)
        #blast_matches = BlastTask(cdhit_fasta,alignment,base+".hpf.fasta").run(force=force)
        #graphics = DomainGraphicsTask(base,blast_matches,base+".svg","","").run(force=force)

        blast_matches = base + ".hpf.fasta"

        session = Session()
        family = session.query(Family).filter(Family.name == name).one()
        debug(family)
        with open(blast_matches) as handle:
            for record in SeqIO.parse(handle, "fasta"):
                # Add a FamilySequence row only when this (family, sequence)
                # pair is not present yet.
                map = session.query(FamilySequence).filter(
                    and_(FamilySequence.family_key == family.id,
                         FamilySequence.sequence_key == int(
                             record.id))).first()
                if map == None:
                    map = FamilySequence()
                    map.family_key = family.id
                    map.sequence_key = int(record.id)
                    debug(map)
                    session.add(map)

        session.commit()
        session.close()

        #runtime().popd()
    finally:
        # Always return to the directory the caller started in.
        os.chdir(cwd)
コード例 #28
0
ファイル: oid.py プロジェクト: bsmithers/hpf
    def merge(self):
        """
        Load or create the family named self.familyName, then import codeml.

        A family that is not found is created together with its alignment and
        tree.  Codeml is imported only when the family's first alignment's
        tree has none.  The session is committed and closed at the end.
        """
        from hpf.hddb.db import Session, Family

        self.session = Session()

        family_query = self.session.query(Family).filter(Family.name == self.familyName)
        self.family = family_query.first()
        if self.family:
            self.alignment = self.family.alignment
            self.tree = self.alignment.tree
            runtime().debug("Found family", self.family.id)
        else:
            runtime().debug("Creating family", self.familyName)
            self._family()
            self._alignment()
            self._tree()

        if self.family.alignments[0].tree.codeml:
            existing_codeml = self.family.alignments[0].tree.codeml
            runtime().debug("Already found codeml", existing_codeml.id)
        else:
            runtime().debug("Importing codeml")
            self._codeml()

        # Persist everything and release the session.
        self.session.commit()
        self.session.close()
コード例 #29
0
ファイル: __init__.py プロジェクト: bsmithers/hpf
def sequences(cursor, sequence_key=None):
    """
    Return the given sequences as SeqRecord objects.

    @param cursor: DB-API cursor used to execute the query.
    @param sequence_key: one id or an iterable of ids; None selects every
        row of the sequence table.
    @return: generator of SeqRecord objects whose id is the row id as a
        string.  An empty iterable of ids yields nothing.
    """
    query = "SELECT id,sequence from sequence"
    if sequence_key is not None:
        if not hasattr(sequence_key, "__iter__"):
            sequence_key = [sequence_key]
        keys = [str(key) for key in sequence_key]
        if not keys:
            # Nothing requested: avoid sending the invalid SQL "in ()".
            return
        query += " where id in (%s)" % (",".join(keys))
    runtime().debug(query)
    cursor.execute(query)
    # Log the fetch stage (as proteins() does) instead of repeating the query.
    runtime().debug("Fetching")
    for id, sequence in cursor.fetchall():
        yield SeqRecord(Seq(sequence), str(id))
コード例 #30
0
    def merge(self):
        """
        Load or create the family named self.familyName, then import codeml.

        A family that is not found is created together with its alignment and
        tree.  Codeml is imported only when the family's first alignment's
        tree has none.  The session is committed and closed at the end.
        """
        from hpf.hddb.db import Session, Family
        self.session = Session()

        family_query = self.session.query(Family)
        family_query = family_query.filter(Family.name == self.familyName)
        self.family = family_query.first()
        if self.family:
            self.alignment = self.family.alignment
            self.tree = self.alignment.tree
            runtime().debug("Found family", self.family.id)
        else:
            runtime().debug("Creating family", self.familyName)
            self._family()
            self._alignment()
            self._tree()

        if self.family.alignments[0].tree.codeml:
            existing_codeml = self.family.alignments[0].tree.codeml
            runtime().debug("Already found codeml", existing_codeml.id)
        else:
            runtime().debug("Importing codeml")
            self._codeml()

        # Persist everything and release the session.
        self.session.commit()
        self.session.close()
コード例 #31
0
ファイル: oid.py プロジェクト: bsmithers/hpf
    def _tree(self):
        """
        Import the tree file, its nodes and its diagnostic characters.

        Steps, in order: read self.treeFile as a Newick tree, rename the
        terminal taxa through self._index, store a Tree row for
        self.alignment, append the tree's nodes to it, then parse
        self.treeDiagCharsFile and add every parsed object to the session.
        """
        session = self.session

        from Bio.Nexus.Trees import Tree as NewickTree

        # Read inside a with-block so the file handle is closed right away
        # instead of being left open until garbage collection.
        with open(self.treeFile) as tree_handle:
            tree_str = tree_handle.read()
        self.nexus = NewickTree(tree_str)

        # Rename all the taxa.
        for terminal_id in self.nexus.get_terminals():
            node = self.nexus.node(terminal_id)
            node.data.taxon = self._index(node.data.taxon)

        # Create the DB object
        from hpf.hddb.db import Tree

        self.tree = Tree(
            alignment_key=self.alignment.id,
            text=self.nexus.to_string(plain=False, plain_newick=True),
            filename=self.treeFile,
        )
        session.add(self.tree)
        # Flush so self.tree.id is assigned before it is used below.
        session.flush()

        # Now add in the node references
        self.nexus.name = self.tree.id
        assert self.tree.id is not None
        runtime().debug("Added tree", self.tree)
        from hpf.hddb.db import TreeNodeFactory

        nodes = list(TreeNodeFactory().create(self.nexus))
        for node in nodes:
            node.ancestor_node = node.ancestor.id if node.ancestor else None
            # Appending to the relationship adds the node to the session.
            self.tree.nodes.append(node)
            # Flush per node, as before, so ids are assigned as nodes are
            # added (later nodes read node.ancestor.id).
            session.flush()

        runtime().debug("Appended", len(nodes), "tree nodes")
        session.flush()

        # Now import the diagnostic characters and reference the nodes.
        from hpf.amnh.oid import DiagCharsParser
        from hpf.hddb.db import TreeFactory

        biotree = TreeFactory(name_func=lambda node: str(node.id)).create(self.tree.nodes, self.tree.id)
        parser = DiagCharsParser(biotree)
        runtime().debug(self.treeDiagCharsFile)
        with open(self.treeDiagCharsFile) as handle:
            diagchars = list(parser.parse(handle))
            runtime().debug("DiagChars", len(diagchars))
            for d in diagchars:
                session.add(d)
        session.flush()
コード例 #32
0
def merge(selection):
    """
    Tag selection with the current family and tree and merge it into the session.

    Reads the module globals `family`, `tree` and `session`.

    @return: the object returned by session.merge().
    """
    selection.family_key = family
    selection.tree_key = tree
    merged = session.merge(selection)
    runtime().debug("Merged", merged)
    return merged
コード例 #33
0
ファイル: __init__.py プロジェクト: bsmithers/hpf
 def create(self, seed, targets, format="fasta", **kwargs):
     """
     Performs a default Mafft seed alignment using the seed and target

     The seed and the targets are written to temporary files, mafft-linsi is
     run through the shell with its output redirected to another temporary
     file, and that file is read back as an alignment.

     @param seed: seed alignment, written with TemporaryAlignmentFile.
     @param targets: target records, written with TemporaryRecordFile.
     @param format: file format used for writing and reading.
     @param kwargs: passed to every temporary-file constructor.
     @return: Seed alignment object
     @raise subprocess.CalledProcessError: if mafft-linsi exits non-zero.
     """
     output = NamedTemporaryFile(**kwargs)
     try:
         with TemporaryAlignmentFile([seed], format=format, **kwargs) as seed_file:
             with TemporaryRecordFile(targets, format=format, **kwargs) as target_file:
                 cmd = "mafft-linsi --quiet --seed %s %s > %s" % (seed_file,target_file, output.name)
                 runtime().debug(cmd)
                 subprocess.check_call(cmd,shell=True)
         with open(output.name) as handle:
             alignment = AlignmentFactory(self._class).read(handle,format)
     finally:
         # Close the temporary output explicitly (also on failure) rather
         # than relying on garbage collection to release it.
         output.close()
     return alignment
コード例 #34
0
ファイル: import_codeml.py プロジェクト: bsmithers/hpf
def merge(selection):
    """
    Tag selection with the current family and tree and merge it into the session.

    Reads the module globals `family`, `tree` and `session`.

    @return: the object returned by session.merge().
    """
    selection.family_key = family
    selection.tree_key = tree
    merged = session.merge(selection)
    runtime().debug("Merged", merged)
    return merged
コード例 #35
0
ファイル: map_families.py プロジェクト: dpenfoldbrown/hpf
def blast(fasta,db="hpf_protein",force=False):
    """
    Map families to the database.

    NOTE: this function is currently disabled.  It creates a directory named
    after the fasta file's base name, changes into it, and then raises an
    Exception unconditionally; the working directory is restored on the way
    out.  Everything after the raise is unreachable and is kept as a
    reference for the intended mapping step.

    @param fasta: path of the family fasta file.
    @param db: not referenced by the live code (only by commented-out lines).
    @param force: not referenced by the live code (only by commented-out lines).
    @raise Exception: always.
    """
    runtime().debug(fasta)
    file = os.path.abspath(fasta)
    # Base name without the last extension; the family name replaces "_" by " ".
    base = ".".join(os.path.basename(file).split(".")[:-1])
    name = base.replace("_"," ")
    dir = base
    subprocess.call("mkdir -p %s"%base, shell=True)
    runtime().debug(dir,fasta,base)
    # Remember the starting directory so the finally clause can restore it.
    cwd = os.getcwd()
    os.chdir(dir)
    
    try:
        # Deliberate guard: nothing below this statement is executed.
        raise Exception("This has been modified like crazy, don't run as is, make sure this is correct")
        #runtime().pushd(dir)
        #formatted_fasta = FormatFastaTask(file,base+".fasta").run(force=force)
        # Cluster the families representatives before blasting everything to HPF
        #cdhit_fasta = CdhitTask(formatted_fasta,formatted_fasta+".cdhit",identity=0.7,length=0.7).run()
        #alignment = MuscleTask(formatted_fasta, base+".aln", clwstrict=True).run(force=force)
        #alignment = FormatAlignmentTask(alignment,base+".alnf").run(force=force)
        #blast_xml, blast_chk = InputAlignmentBlastTask(formatted_fasta,alignment, db="hpf_protein").run(force=force)
        #blast_matches = BlastParseXMLTask(blast_xml, base+".hpf.fasta", 0.8, expect=1e-6).run(force=force)
        #blast_matches = BlastTask(cdhit_fasta,alignment,base+".hpf.fasta").run(force=force)
        #graphics = DomainGraphicsTask(base,blast_matches,base+".svg","","").run(force=force)
        
        blast_matches = base+".hpf.fasta"
        
        session = Session()
        family = session.query(Family).filter(Family.name==name).one()
        debug(family)
        with open(blast_matches) as handle:
            for record in SeqIO.parse(handle, "fasta"):
                # Add a FamilySequence row only when this (family, sequence)
                # pair is not present yet.
                map = session.query(FamilySequence).filter(and_(FamilySequence.family_key==family.id, FamilySequence.sequence_key==int(record.id))).first()
                if map == None:
                    map = FamilySequence()
                    map.family_key=family.id
                    map.sequence_key = int(record.id)
                    debug(map)
                    session.add(map)
                
        session.commit()
        session.close()
            
        #runtime().popd()
    finally:
        # Always return to the directory the caller started in.
        os.chdir(cwd)
コード例 #36
0
def main(fam, t, *files):
    """
    Import positive selection sites from the given codeml output files.

    Sets the module globals ``family``, ``tree`` and ``session``, parses every
    file with PositiveSelectionParser and passes each parsed item to
    ``merge``.  The session is committed once all files have been processed.

    @param fam: family key; converted with int().
    @param t: tree key; converted with int().
    @param files: paths of the codeml output files to parse.
    """
    global family, tree, session
    session = Session()
    try:
        family = int(fam)
        tree = int(t)

        for path in files:
            #runtime().set_debug(1)
            runtime().debug("Using file", path)
            with open(path) as handle:
                ps = PositiveSelectionParser().parse(handle)
                # Consume while the handle is still open: if parse() yields
                # lazily, the file is only read during this iteration.
                count = consume(imap(merge, ps))
            runtime().debug("Found", count, "sites")

        session.commit()
    finally:
        # Release the session even when parsing or merging fails.
        session.close()
コード例 #37
0
ファイル: export_xml.py プロジェクト: bsmithers/hpf
def xml(id):
    """
    Export one family to an XML report named "<family id>.xml".

    The file name is relative, so the report lands in the current working
    directory.  When the GZIP runtime option is set the report is written
    through gzip and ".gz" is appended to the file name.

    @param id: primary key handed to session.query(Family).get().
    """
    from hpf.hddb.db import Session, Family
    session = Session()
    try:
        family = session.query(Family).get(id)
        filename = "%i.xml" % family.id

        if runtime().opt(GZIP):
            import gzip
            filename = "%s.gz" % filename
            handle = gzip.open(filename,"w")
        else:
            handle = open(filename,"w")

        try:
            doc = FamilyFeatureBuilder(
                lambda: DefaultXMLGenerator(handle,pretty=True),
                lambda handler: StructureFeatureProvider(handler),
                lambda handler: ColumnFeatureProvider(handler),
                lambda handler: IeaFeatureProvider(handler),
                lambda handler: SelectionFeatureProvider(handler)
                )
            doc.buildDocument(family)
        finally:
            handle.close()
    finally:
        # Close the session on every path, including a failed lookup or build.
        session.close()
コード例 #38
0
ファイル: import_proteome.py プロジェクト: bsmithers/hpf
def add(record):
    """
    Link a fasta record to its database sequence, accession and protein.

    The record is looked up in the Sequence table by its digest and a new row
    is inserted when no match exists.  The record is then passed through ac()
    and protein() and renamed to the protein key of its accession.

    @param record: sequence record exposing .seq and .id.
    @return: the record, with .id replaced by str(record._ac.protein_key).
    """
    record._digest = digest(record)
    by_digest = session.query(Sequence).filter(Sequence.sha1==record._digest)
    sequence = by_digest.first()
    if sequence:
        runtime().debug("Found sequence", record.id, sequence.id)
    else:
        sequence = Sequence(sequence = str(record.seq),
                            sha1= record._digest)
        session.add(sequence)
        # Flush before sequence.id is read for the log line below.
        session.flush()
        runtime().debug("Added sequence", record.id, sequence.id)
    record._hddb = sequence

    record = protein(ac(record))
    record.id = str(record._ac.protein_key)
    return record
コード例 #39
0
def _do(argv):
    """
    Register the command line options, parse argv and hand the result to main.

    @param argv: command line arguments passed to runtime().parse_options().
    """
    rt = runtime()
    rt.description("""
    template.py [-options] args
    template script.
    """)

    synchronous = Flag(
        SYNCHRONOUS, "s",
        description="Run this script synchronously without any multi-processing")
    rt.add_option(synchronous)

    nexus = Flag(NEXUS, "n", description="The input files are in NEXUS format.")
    rt.add_option(nexus)

    max_size = IntegerOption(
        MAX_SIZE, "l",
        description="Maximum number of leaves in the trees. Default:40",
        default=40)
    rt.add_option(max_size)

    directory = FileOption(
        DIRECTORY, "o",
        description="Directory to export to. Default to dirname(tree_file)",
        default=None)
    rt.add_option(directory)

    # Whatever parse_options returns is unpacked into main's positional args.
    main(*rt.parse_options(argv))
コード例 #40
0
def split(tree_file, size, nexus=False, dir=None):
    """
    Split a tree into subtrees and write them next to an annotated full tree.

    Each subtree produced by TreeSplitter is written to
    "<dir>/<basename(tree_file)>.<i>".  A second copy of the full tree, in
    which every terminal taxon found in subtree i is prefixed with "<i>-",
    is written to "<dir>/<basename(tree_file)>.annotated".

    @param tree_file: path of the tree to split.
    @param size: passed to TreeSplitter as max_size.
    @param nexus: read tree_file with Nexus instead of as a plain tree string.
    @param dir: output directory; defaults to dirname(tree_file).
    """
    # Report the actual input path (this used to print the builtin `file`).
    print("%s %s" % (tree_file, size))

    # Two separate tree objects are built from the same input: `tree` goes to
    # the splitter, `annotated` is relabelled and written out whole.
    if nexus:
        tree = Nexus(tree_file).trees[0]
        annotated = Nexus(tree_file).trees[0]
    else:
        with open(tree_file) as handle:
            tree_str = handle.read()
        tree = Tree(tree_str)
        annotated = Tree(tree_str)

    splitter = TreeSplitter(tree,
                            max_size=size,
                            annotater=UnrootedShortestPath)
    subs = list(splitter.subtrees())
    runtime().debug("Found", len(subs), subs)
    dir = dir if dir else os.path.dirname(tree_file)
    out_base = os.path.join(dir, os.path.basename(tree_file))

    for i, subtree in enumerate(subs):
        nodes = [subtree.node(node_id) for node_id in subtree.all_ids()]
        taxa = set(node.data.taxon for node in nodes
                   if node.data.taxon is not None)
        # Tag the matching terminals of the full tree with the subtree index.
        for terminal in annotated.get_terminals():
            node = annotated.node(terminal)
            if node.data.taxon in taxa:
                node.data.taxon = "%i-" % i + node.data.taxon

        with open("%s.%i" % (out_base, i), "w") as handle:
            handle.write(subtree.to_string(plain_newick=True,
                                           branchlengths_only=False) + ";\n")

    with open("%s.annotated" % out_base, "w") as handle:
        handle.write(annotated.to_string(plain_newick=True,
                                         branchlengths_only=False) + ";\n")
コード例 #41
0
def _do(argv):
    """
    Set the usage text, parse the command line and call main with the result.

    @param argv: command line arguments passed to runtime().parse_options().
    """
    rt = runtime()
    rt.description("""
    map_fasta.py [-options] fasta
    Rename all of the records with sequence id's from the database.
    """)
    # Whatever parse_options returns is unpacked into main's positional args.
    main(*rt.parse_options(argv))
コード例 #42
0
ファイル: import_proteome.py プロジェクト: bsmithers/hpf
def _do(argv):
    """
    Set the usage text, parse the command line and call main with the result.

    @param argv: command line arguments passed to runtime().parse_options().
    """
    rt = runtime()
    rt.description("""
    import_proteome.py [-options] fasta experiment_id
    template script.
    """)
    # Whatever parse_options returns is unpacked into main's positional args.
    main(*rt.parse_options(argv))
コード例 #43
0
    def _tree(self):
        """
        Import the tree file, its nodes and its diagnostic characters.

        Reads self.treeFile as a tree string, renames every terminal taxon
        through self._index(), stores a Tree row that references
        self.alignment, appends the nodes built by TreeNodeFactory and adds
        the records parsed from self.treeDiagCharsFile to the session.

        Sets self.nexus (the parsed tree) and self.tree (the DB object).
        """
        session = self.session

        from Bio.Nexus.Trees import Tree as NewickTree
        # Read through a context manager so the handle is closed right away.
        with open(self.treeFile) as handle:
            tree_str = handle.read()
        self.nexus = NewickTree(tree_str)

        # Rename all the taxa.
        for terminal_id in self.nexus.get_terminals():
            node = self.nexus.node(terminal_id)
            node.data.taxon = self._index(node.data.taxon)

        # Create the DB object
        from hpf.hddb.db import Tree
        self.tree = Tree(alignment_key=self.alignment.id,
                         text=self.nexus.to_string(plain=False,
                                                   plain_newick=True),
                         filename=self.treeFile)
        session.add(self.tree)
        session.flush()

        # Now add in the node references
        self.nexus.name = self.tree.id
        assert self.tree.id is not None
        runtime().debug("Added tree", self.tree)
        from hpf.hddb.db import TreeNodeFactory
        nodes = list(TreeNodeFactory().create(self.nexus))
        for node in nodes:
            node.ancestor_node = node.ancestor.id if node.ancestor else None
            # This should add the new object into the session
            self.tree.nodes.append(node)
            # Flushed per node; presumably so that node.ancestor.id is set
            # before a descendant reads it -- confirm before batching.
            session.flush()

        runtime().debug("Appended", len(nodes), "tree nodes")
        session.flush()

        # Now import the diagnostic characters and reference the nodes.
        from hpf.amnh.oid import DiagCharsParser
        from hpf.hddb.db import TreeFactory
        biotree = TreeFactory(name_func=lambda node: str(node.id)).create(
            self.tree.nodes, self.tree.id)
        parser = DiagCharsParser(biotree)
        runtime().debug(self.treeDiagCharsFile)
        with open(self.treeDiagCharsFile) as handle:
            # Materialize inside the with-block, while the handle is open.
            diagchars = list(parser.parse(handle))
            runtime().debug("DiagChars", len(diagchars))
            for d in diagchars:
                session.add(d)
        session.flush()
コード例 #44
0
 def create(self, seed, targets, format="fasta", **kwargs):
     """
     Performs a default Mafft seed alignment using the seed and target

     @param seed: seed alignment, written to a temporary alignment file.
     @param targets: records, written to a temporary record file.
     @param format: format of both temporary files and of the parsed result.
     @param kwargs: forwarded to NamedTemporaryFile, TemporaryAlignmentFile
         and TemporaryRecordFile.
     @return: Seed alignment object
     @raise subprocess.CalledProcessError: when mafft-linsi exits non-zero.
     """
     output = NamedTemporaryFile(**kwargs)
     try:
         with TemporaryAlignmentFile([seed], format=format,
                                     **kwargs) as seed_file:
             with TemporaryRecordFile(targets, format=format,
                                      **kwargs) as target_file:
                 # The shell is needed for the ">" redirection into output.
                 cmd = "mafft-linsi --quiet --seed %s %s > %s" % (
                     seed_file, target_file, output.name)  #"test.txt"
                 runtime().debug(cmd)
                 subprocess.check_call(cmd, shell=True)
         #print open(output.name).read()
         with open(output.name) as handle:
             alignment = AlignmentFactory(self._class).read(handle, format)
     finally:
         # Close the temporary output explicitly instead of leaving its
         # cleanup to garbage collection, also when mafft or parsing fails.
         output.close()
     return alignment
コード例 #45
0
def _do(argv):
    """
    Set the usage text, parse the command line and call main with the result.

    @param argv: command line arguments passed to runtime().parse_options().
    """
    rt = runtime()
    rt.description("""
    oid.py [-options] *oid-directory
    Import an OID family.
    """)
    #rt.add_option(Flag(SYNCHRONOUS, "s", description="Run this script synchronously without any multi-processing"))
    # Whatever parse_options returns is unpacked into main's positional args.
    main(*rt.parse_options(argv))
コード例 #46
0
def merge(record):
    """
    Merge one alignment record into the session as a seed FamilySequence.

    @param record: record whose .id converts with int() to a sequence key
        and whose .seq holds the aligned sequence.
    @return: the instance returned by session.merge().
    """
    link = FamilySequence()
    # int() raises if the record id is not a numeric sequence key.
    link.sequence_key = int(record.id)
    link.family_key = family
    link.seed = True
    link.alignment = str(record.seq)

    merged = session.merge(link)
    runtime().debug("Merged", merged.sequence_key, merged.family_key,
                    merged.alignment)
    return merged
コード例 #47
0
def _do(argv):
    """
    Set the usage text, parse the command line and call main with the result.

    @param argv: command line arguments passed to runtime().parse_options().
    """
    rt = runtime()
    rt.description("""
    import_codeml.py [-options] familykey treekey *codeml_output
    Import codeml positive selection output.
    """)
    #rt.add_option(Option(FORMAT, "f", description="Alignment Format", default="fasta"))
    # Whatever parse_options returns is unpacked into main's positional args.
    main(*rt.parse_options(argv))
コード例 #48
0
 def blast(self, fasta, output):
     """
     Run blastpgp on the fasta with the stored alignment as input alignment.

     @param fasta: query passed to blastpgp.
     @param output: passed to blastpgp as align_outfile.
     @return: the output argument, unchanged.
     """
     message = "Blasting %s with alignment %s using %s" % (
         fasta, self.alignment, self.blast_exe)
     runtime().debug(message)

     result_handle, error_handle = NCBIStandalone.blastpgp(
         self.blast_exe,
         self.db,
         fasta,
         align_infile=self.alignment,
         align_outfile=output,
         expectation=self.expect,
         model_threshold=self.expect,
         npasses=3,
         nprocessors=1,
         **self.kwargs)
     # Only the first returned handle is drained; the second is left unread.
     consume(result_handle)
     return output
コード例 #49
0
ファイル: import_proteome.py プロジェクト: bsmithers/hpf
def main(file, experiment_id):
    """
    Map every record of a fasta file to the database and write "<file>.hpf".

    Sets the module globals ``fasta`` (base name of the input), ``session``
    and ``experiment``, runs add() on each record and writes the returned
    records in fasta format to file + ".hpf".

    @param file: path of the input fasta file.
    @param experiment_id: primary key handed to session.query(Experiment).get().
    """
    global fasta, session, experiment

    #runtime().set_debug(1)
    runtime().debug("Using file",file)
    session = Session()
    try:
        fasta = os.path.basename(file)

        experiment = session.query(Experiment).get(experiment_id)
        with open(file) as handle:
            records = list(SeqIO.parse(handle,"fasta"))
        runtime().debug("Found %i records" % len(records))
        runtime().indent()
        try:
            # A real list is required: len(records) is taken below.
            records = [add(record) for record in records]
        finally:
            # Restore the debug indentation even if a record fails.
            runtime().unindent()
    finally:
        # NOTE(review): no session.commit() is issued in this function --
        # confirm the Session is configured so the added rows persist.
        session.close()
    runtime().debug("Writing %i records" % len(records))
    with open(file+".hpf","w") as handle:
        SeqIO.write(records, handle, "fasta")
コード例 #50
0
def _do(argv):
    """
    Register the format option, parse argv and hand the result to main.

    @param argv: command line arguments passed to runtime().parse_options().
    """
    rt = runtime()
    rt.description("""
    import_alignment.py [-options] fasta family
    Import an alignment and attach to a family.
    """)
    format_option = Option(FORMAT, "f",
                           description="Alignment Format",
                           default="fasta")
    rt.add_option(format_option)
    # Whatever parse_options returns is unpacked into main's positional args.
    main(*rt.parse_options(argv))
コード例 #51
0
ファイル: import_proteome.py プロジェクト: bsmithers/hpf
def ac(record):
    """
    Find or create the 'amnh' SequenceAc row for a record.

    The lookup matches on the key of record._hddb, db 'amnh' and the record
    id.  A missing row is created with the module globals ``fasta`` as its
    description and ``experiment.taxonomy_id`` as its taxonomy.

    @param record: record carrying ._hddb (its database sequence) and .id.
    @return: the record, with ._ac set to the accession row.
    """
    sequence_key = record._hddb.id
    matches = session.query(SequenceAc).filter(
        and_(SequenceAc.sequence_key==sequence_key,
             SequenceAc.db=='amnh',
             SequenceAc.ac==record.id))
    acc = matches.first()

    if acc:
        runtime().debug("Found ac", acc)
    else:
        acc = SequenceAc(sequence_key=sequence_key,
                         gi=None,
                         db='amnh',
                         ac=record.id,
                         ac2=None,
                         description=fasta,
                         taxonomy_id=experiment.taxonomy_id)
        session.add(acc)
        session.flush()
        runtime().debug("Added ac", acc)

    record._ac = acc
    return record
コード例 #52
0
ファイル: import_proteome.py プロジェクト: bsmithers/hpf
def protein(record):
    """
    Find or create the Protein row referenced by the record's accession.

    When record._ac has no protein key, a new 'amnh' Protein is inserted for
    the module global ``experiment`` and its id is stored on the accession.
    Otherwise the existing Protein is loaded by that key.

    @param record: record carrying ._ac (accession row) and ._hddb (sequence).
    @return: the record, with ._protein set.
    """
    if record._ac.protein_key is None:
        # Named new_protein so the local does not shadow this function.
        new_protein = Protein(experiment_key=experiment.id,
                              protein_type='amnh',
                              sequence_key=record._hddb.id,
                              probability=0,
                              comment="AMNH protein families",
                              file_key=0,
                              parse_key=0,
                              gene_key=0,
                              insert_data=func.CURRENT_DATE()
                              )
        session.add(new_protein)
        # Flush before new_protein.id is copied onto the accession.
        session.flush()
        record._ac.protein_key = new_protein.id
        runtime().debug("Modified",session.is_modified(record._ac))
        session.add(record._ac)
        record._protein = new_protein
        runtime().debug("Added protein",record._protein)
    else:
        record._protein = session.query(Protein).get(record._ac.protein_key)
        runtime().debug("Found protein key",record._protein)
    return record
コード例 #53
0
ファイル: import_tree.py プロジェクト: bsmithers/hpf
def _do(argv):
    """
    Register the synchronous flag, parse argv and hand the result to main.

    @param argv: command line arguments passed to runtime().parse_options().
    """
    rt = runtime()
    rt.description("""
    import_tree.py [-options] tree_file family_key
    Import a tree into the database.
    """)
    synchronous = Flag(
        SYNCHRONOUS, "s",
        description="Run this script synchronously without any multi-processing")
    rt.add_option(synchronous)
    # Whatever parse_options returns is unpacked into main's positional args.
    main(*rt.parse_options(argv))
コード例 #54
0
def _do(argv):
    """
    Register the synchronous flag, parse argv and hand the result to main.

    @param argv: command line arguments passed to runtime().parse_options().
    """
    rt = runtime()
    rt.description("""
    template.py [-options] args
    template script.
    """)
    synchronous = Flag(
        SYNCHRONOUS, "s",
        description="Run this script synchronously without any multi-processing")
    rt.add_option(synchronous)
    # Whatever parse_options returns is unpacked into main's positional args.
    main(*rt.parse_options(argv))
コード例 #55
0
def _do(argv):
    """
    Register the synchronous and gzip flags, parse argv and call main.

    @param argv: command line arguments passed to runtime().parse_options().
    """
    rt = runtime()
    # Spelled with explicit escapes so the trailing space at the end of the
    # second line of the usage text cannot be lost to whitespace stripping.
    usage = (
        "\n"
        "    export_xml.py [-options] args\n"
        "    Exports full structure/family reports for families to xml files. \n"
        "    "
    )
    rt.description(usage)

    synchronous = Flag(
        SYNCHRONOUS, "s",
        description="Run this script synchronously without any multi-processing")
    rt.add_option(synchronous)

    gzip_flag = Flag(GZIP, "z", description="GZip the resulting files.")
    rt.add_option(gzip_flag)

    # Whatever parse_options returns is unpacked into main's positional args.
    main(*rt.parse_options(argv))
コード例 #56
0
ファイル: _utilities.py プロジェクト: bsmithers/hpf
 def run(self,force=False):
     """
     Run the task unless every one of its output files already exists.

     If self._do() raises, any output file that exists is removed (best
     effort) and the exception is re-raised.

     @param force: run self._do() even when all output files exist.
     @return: the result of self._do(), or self.output when that result is
         None; self._exists() when the task is skipped.
     """
     # Normalize self.output to a list of paths once and use that list for
     # both the existence check and the cleanup.  Any string counts as one
     # path, whether or not the string type itself is iterable.
     output = self.output
     if isinstance(output, str) or not hasattr(output,'__iter__'):
         output = [output]
     else:
         output = list(output)
     which = [os.path.exists(f) for f in output]
     if force or not all(which):
         try:
             r = self._do()
             return r if r is not None else self.output
         except BaseException:
             # Deliberately broad: partial output must be removed on any
             # failure, and the original exception is always re-raised.
             for path in output:
                 try:
                     if os.path.exists(path):
                         os.remove(path)
                 except Exception:
                     # Best effort: report and keep cleaning the other files.
                     sys.stderr.write("cannot remove file  %s\n" % (path,))
             raise
     else:
         # Every output exists on this branch, so the whole list is reported.
         runtime().debug("Output exists, skipping", list(output))
         return self._exists()
コード例 #57
0
def _do(argv):
    """
    Register the synchronous and force flags, parse argv and call main.

    @param argv: command line arguments passed to runtime().parse_options().
    """
    rt = runtime()
    rt.description("""
    families.py [-options] args
    Blast protein famlies against HPF database and gather representatives.
    """)

    synchronous = Flag(
        SYNCHRONOUS, "s",
        description="Run this script synchronously without any multi-processing")
    rt.add_option(synchronous)

    force_flag = Flag(FORCE, "f",
                      description="Force re-running of all tasks.")
    rt.add_option(force_flag)

    # Whatever parse_options returns is unpacked into main's positional args.
    main(*rt.parse_options(argv))
コード例 #58
0
def main(*args):
    """
    Run _split over every positional argument through a processor pool.

    Each argument becomes one task tuple of the argument itself followed by
    the MAX_SIZE, NEXUS and DIRECTORY runtime options.

    @param args: one entry per task (presumably tree file paths -- confirm
        against _split).
    """
    pool = processor(synchronous=runtime().opt(SYNCHRONOUS))
    runtime().debug("Using processor", pool)

    def build_tasks():
        # Options are read when the pool invokes this callable, not earlier.
        count = len(args)
        max_sizes = repeat(runtime().opt(MAX_SIZE), count)
        nexus_flags = repeat(runtime().opt(NEXUS), count)
        directories = repeat(runtime().opt(DIRECTORY), count)
        return zip(args, max_sizes, nexus_flags, directories)

    pool.make_tasks(build_tasks)
    consume(pool.run(_split))