def initialize():
    global dataSavePath, post_list_file_name, img_list_file_name, comments_list_file_name, repeat

    dataSavePath = osPathJoin(osGetcwd(), 'CrawlerDataFile')
    post_list_file_name = "post_list.json"
    img_list_file_name = "img_list.json"
    comments_list_file_name = "comments_list.json"
    repeat = False
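The snippet above, like most examples on this page, relies on aliased imports of the os helpers; a minimal module header matching that convention (the exact alias lines are an assumption for this fragment, mirroring the `from os.path import join as osPathJoin` imports shown in later examples) would be:

# assumed import aliases for this snippet
from os import getcwd as osGetcwd
from os.path import join as osPathJoin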
Example #2
def blastall_seq2db(header,
                    sequence,
                    dbname="",
                    blastprogram="blastp",
                    output="ncbiparsed",
                    extra_blastp_params={
                        'F': 'F',
                        'e': '10'
                    }):
    """
    """
    if blastprogram not in ['blastp', 'tblastn', 'blastn', 'blastx']:
        raise "only blastp and tblastn are supported"

    extra_params = " ".join(
        ["-%s %s" % (k, v) for k, v in extra_blastp_params.iteritems()])
    # generate (semi ;-) unique filename
    uniquetag = get_random_string_tag()
    fname = "_".join(
        [uniquetag,
         str(header).replace(" ", "_"), sequence[0:10] + ".fa"])
    fname = osPathJoin(OSgetcwd(), fname)
    fh = open(fname, 'w')
    fh.write(">%s\n%s\n" % (header, sequence))
    fh.close()
    command = "%s -p %s %s -i %s -d %s " % (BLASTALL_PATH, blastprogram,
                                            extra_params, fname, dbname)
    try:
        ci, co, ce = osPopen3(command)
        ci.close()
        if output == "ncbiparsed":
            b_parser = NCBIStandalone.BlastParser()
            blastallout = b_parser.parse(co)
        else:
            blastallout = co.read()
        co.close()
        ce.close()
    except:
        # for some kind of - obvious or freak accident case -
        # Blast or parsing of the blast record failed
        # No debugging here; just cleanup and return False
        print "BLAST CRASHED::"
        print command
        blastallout = False

    # remove the created Query file
    osRemove(fname)
    # and return!
    return blastallout
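A hypothetical call of blastall_seq2db (header, sequence and database name below are placeholders; BLASTALL_PATH and Biopython's NCBIStandalone parser are assumed to be set up as in the module above) could look like:

# hedged usage sketch -- query header, sequence and dbname are made-up placeholders
blastrecord = blastall_seq2db("query_01",
                              "MSTNPKPQRKTKRNTNRRPQDVKFPGG",
                              dbname="proteins-blastdb.fa",
                              blastprogram="blastp")
if blastrecord:
    # the NCBIStandalone parser returns a Blast record with an alignments list
    for alignment in blastrecord.alignments:
        print alignment.title
else:
    print "BLAST (or parsing) failed for query_01"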
Example #4
def abgpsysexit(input,OPTIONS,message=""):
    """ """
    if input.has_key(OPTIONS.target):
        key = OPTIONS.target
    elif OPTIONS.target == None:
        key = None    
    else:
        # find by protein fref
        for k in input.keys():
            if input[k]['gldobj'].protein_fref() == OPTIONS.target:
                key = k
                break
        else:
            key = None

    # get filename to write to
    if key:
        fname = "%s.bailout.log" % input[key]['gldobj'].protein_fref()
    elif key == None and OPTIONS.filewithloci:
        hdr = OPTIONS.filewithloci.split("/")[-1].split(".")[0]
        fname = "%s.bailout.log" % hdr 
    elif key == None and OPTIONS.dirwithloci:
        hdr = OPTIONS.dirwithloci.split("/")[-1].split(".")[0]
        fname = "%s.bailout.log" % hdr
    else:
        fname = "%s.bailout.log" % ("Notargetapplied") 

    # clean the complete directory from all files
    _file_cleanup( osListdir(OPTIONS.outdir), include_directories=True )
        
    fname = osPathJoin(OPTIONS.outdir,fname)
    fh = open(fname,'w')
    fh.write(message+"\n")
    fh.close()
    # safely break out of the ABGP algorithm
    sysExit()
Example #5
def HistoryDataLoad():
    post_list_history = []
    # img_list_history = []
    # comments_list_history = []

    if FilePathGlobals.repeat == False:
        post_list_path = osPathJoin(FilePathGlobals.dataSavePath,
                                    FilePathGlobals.post_list_file_name)

        # img_list_path = osPathJoin(FilePathGlobals.dataSavePath,
        #                            FilePathGlobals.img_list_file_name)

        # comments_list_path = osPathJoin(FilePathGlobals.dataSavePath,
        #                                 FilePathGlobals.comments_list_file_name)

        # check whether the file exists
        if CheckFile(post_list_path):
            with open(post_list_path,
                      encoding='utf-8',
                      errors='ignore') as f:
                post_list_history = json.load(f, strict=False)

        # # check whether the file exists
        # if CheckFile(img_list_path):
        #     with open(img_list_path,
        #               encoding='utf-8',
        #               errors='ignore') as f:
        #         img_list_history = json.load(f, strict=False)

        #         # check whether the file exists
        # if CheckFile(comments_list_path):
        #     with open(comments_list_path,
        #               encoding='utf-8',
        #               errors='ignore') as f:
        #         comments_list_history = json.load(f, strict=False)
    return post_list_history
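CheckFile used above (and CheckDir used in a later example) are project helpers that this page does not show; a minimal stand-in, purely an assumption about their behaviour, could be:

# hypothetical stand-ins for the CheckFile / CheckDir helpers (assumption)
from os import makedirs as osMakedirs
from os.path import isfile as osPathIsfile, isdir as osPathIsdir

def CheckFile(path):
    # True when the history file already exists
    return osPathIsfile(path)

def CheckDir(path):
    # create the save directory when it does not exist yet
    if not osPathIsdir(path):
        osMakedirs(path)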
Example #8
    def __init__(self): pass
# end of class AcceptorPSSMOptions


################################################################################
# Which (Non)canonical splice sites are recognized/accepted?
################################################################################
CANONICAL_DONOR_SITES     = ['GT']
NON_CANONICAL_DONOR_SITES = ['GC']
CANONICAL_ACCEPTOR_SITES  = ['AG']


################################################################################
# PSSM_IC files for donors & acceptors
################################################################################
IC_DONOR_DATA_FILE         = osPathJoin(MAIN_ABGP_PATH,
                                        "datafiles/ic_donor_5fungi.txt")
IC_DONOR_PATTERN_OFFSET    = (3,4)
IC_ACCEPTOR_DATA_FILE      = osPathJoin(MAIN_ABGP_PATH,
                                        "datafiles/ic_acceptor_5fungi.txt")
IC_ACCEPTOR_PATTERN_OFFSET = (6,3)
IC_DONOR_NCGC_DATA_FILE    = osPathJoin(MAIN_ABGP_PATH,
                                        "datafiles/ic_ncdonorgc_5fungi.txt")

################################################################################
# Threshold values for donor site PSSM cutoffs
# Recommended to set generously low!
# Non-canonical donor sites are supported, but disabled by default
################################################################################
MIN_DONOR_PSSM_SCORE                    = 0.0 
ALLOW_NON_CANONICAL_DONOR               = True
NON_CANONICAL_MIN_DONOR_PSSM_SCORE      = float(3.0)
Example #9
def CrawlerAction(days):
    try:
        # initialize the global variables
        FilePathGlobals.initialize()

        # fetch the post list for the given number of days and keep it as a list
        post_list_temp = get_post_list(days)
        # print("\n\n post_list:", post_list)

        # feed every url in the list to post_crawler to crawl the post content; it returns post_time, author, comments, img_name_list, img_url_list
        post_list = []
        # img_list = []
        # comments_list = []
        for index, post in enumerate(post_list_temp):
            post_time, author, comments, img_name_list, img_url_list = post_crawler(
                post['url'], index)

            post_list_temp[index]['post_time'] = post_time
            post_list_temp[index]['author'] = author
            post_list_temp[index]['imgs'] = img_url_list
            post_list_temp[index]['comments'] = comments
            post_list.append(post_list_temp[index])

            # # img_list
            # for imgUrl in img_url_list:
            #     imgs_dict = {}
            #     imgs_dict['post_index'] = postIndex
            #     # print(post_time, author, img_name_list, img_url_list)
            #     imgs_dict['img_url'] = imgUrl
            #     img_list.append(imgs_dict)

            # #  comments_list
            # for comment_info in comments:
            #     comments_dict = {}
            #     comments_dict = comment_info
            #     comments_dict['post_index'] = postIndex
            #     comments_list.append(comments_dict)

            # print the data of every page
            print(post_list[index]['title'])
            print(post_list[index]['url'], '\n')

        # make sure the save directory exists
        CheckDir(FilePathGlobals.dataSavePath)

        # remove entries duplicated in the historical data
        from Model.Crawler.HistoryData.JsonHistoryDataModel import HistoryDataLoad, UpdateRepeatData
        post_list_history = HistoryDataLoad()
        post_list, post_list_history = UpdateRepeatData(
            post_list, post_list_history)
        post_list.extend(post_list_history)

        # save post_list to the JSON file
        with open(osPathJoin(FilePathGlobals.dataSavePath,
                             FilePathGlobals.post_list_file_name),
                  'w',
                  encoding="utf-8") as file:
            json.dump(post_list, file, ensure_ascii=False)

        # # img_list
        # with open(osPathJoin(FilePathGlobals.dataSavePath, FilePathGlobals.img_list_file_name),
        #           'w', encoding="utf-8") as file:
        #     json.dump(img_list, file, ensure_ascii=False)

        # # comments_list
        # with open(osPathJoin(FilePathGlobals.dataSavePath, FilePathGlobals.comments_list_file_name),
        #           'w', encoding="utf-8") as file:
        #     json.dump(comments_list, file, ensure_ascii=False)

    except Exception as e:
        from traceback import format_exc
        print(format_exc())
Example #10
def createblastdbs(input,GSG,OPTIONS,dbfraction=None,organism=None,acceptorfids=[],rejectorfids=[]):
    """
    (Re)create blast-db's by masking the areas that are incorporated in the GSG

    @type  input: dict
    @param input: `input` data structure dictionary

    @type  GSG: GenestructureOfCodingBlockGraphs
    @param GSG: GenestructureOfCodingBlockGraphs instance

    @type  OPTIONS: optparse options instance
    @param OPTIONS: optparse options instance (with attribute 'abinitio')

    @type  dbfraction: string
    @param dbfraction: None, 'all', 'GSGupstream', 'GSGcentral', 'GSGdownstream', 'annotation'

    @type  organism: organism identifier
    @param organism: only recreate blastdb for this organism/gene identifier

    @type  acceptorfids: list with integers
    @param acceptorfids: list of orf ids to accept

    @type  rejectorfids: list with integers
    @param rejectorfids: list of orf ids to reject

    @attention: acceptorfids and rejectorfids are only used when organism is specified!
    """
    seqsindb = {}
    for org in input.keys():
        # if organism is given, do only this one
        if organism and org!=organism: continue
        # acceptorfids and rejectorfids are only valid in combination with `organism`
        if not organism: acceptorfids, rejectorfids = [], [] 
  
        # assign blast database name / multi fasta file and open filehandle
        uniquetag = get_random_string_tag()
        fname = '%s-blastdb-%s.fa' % (uniquetag,org)
        fullpath = osPathJoin(OPTIONS.outdir,fname)
        fh = open(fullpath,'w')
        seqsindb[org] = 0

        # distinct cases possible:
        if len(GSG):
            # there is already a GSG, so this is not the first blast iteration
            # do not apply a shortcut when OPTIONS.abinitio == False
            coords = GSG.omsr2mask(org)
            if dbfraction == 'GSGupstream':
                # take only orfs LEFT of the first CBG in GSG
                max_orf_nt_start = max(GSG[0].overall_minimal_spanning_range(organism=org)) * 3
                orflist = input[org]['orfs'].get_elegiable_orfs(max_orf_start=max_orf_nt_start,
                        acceptorfids=acceptorfids,rejectorfids=rejectorfids)
            elif dbfraction == 'GSGdownstream':
                # take only orfs RIGHT of the last CBG in GSG
                min_orf_nt_end = min(GSG[-1].overall_minimal_spanning_range(organism=org)) * 3
                orflist = input[org]['orfs'].get_elegiable_orfs(min_orf_end=min_orf_nt_end,
                        acceptorfids=acceptorfids,rejectorfids=rejectorfids)
            elif dbfraction == 'GSGcentral':
                # take only orfs in between FIRST and LAST CBG in GSG (can be only one CBG!)
                max_orf_nt_start = max(GSG[-1].overall_minimal_spanning_range(organism=org)) * 3
                min_orf_nt_end   = min(GSG[0].overall_minimal_spanning_range(organism=org)) * 3
                orflist = input[org]['orfs'].get_elegiable_orfs(min_orf_end=min_orf_nt_end,
                        max_orf_start=max_orf_nt_start,
                        acceptorfids=acceptorfids,rejectorfids=rejectorfids)
            else:
                # dbfraction equals 'all' or None -> no limitation, just take all orfs!
                # do only the general limitation on sublists of orfids
                orflist = input[org]['orfs'].get_elegiable_orfs(
                        acceptorfids=acceptorfids,rejectorfids=rejectorfids)

            # create masked fasta of this sequence part only
            newfasta = input[org]['orfs'].tomaskedfasta(coords=coords,orflist=orflist,header_prefix=org)
            # write to file and count accessions in this file -> seqsindb[org]
            fh.write(newfasta)
            seqsindb[org] = newfasta.count(">")

        else:
            # No filled GSG objects -> no a priori knowledge yet
            # When dbfraction=='annotated' and !OPTIONS.abinitio -> take annotated orfs only
            # TODO: dbfraction is not checked/used here -> just OPTIONS.abinitio
            for orf in input[org]['orfs'].orfs:
                # in case not abinitio, make a db of only the orfs in the current annotation!
                if OPTIONS.abinitio == False and orf.id not in input[org]['orfid-genestructure']:
                    continue
                if orf.id in rejectorfids:
                    # ignore Orfs that are listed as to-be-ignored
                    continue
                if acceptorfids and orf.id not in acceptorfids:
                    # ignore Orfs that are not listed as to-be-accepted
                    continue
                # write fasta of orf to file
                fh.write(orf.tofasta(header="%s_orf_%s" % (org,orf.id))+"\n")
                # increase seqsindb[org] counter
                seqsindb[org]+=1

        # close the filehandle
        fh.close()
        # run formatdb
        formatdb(fname=fullpath)
        # set name of blastdb in infodict
        input[org]['blastdb'] = fullpath

    # return the counter of how much orf sequences are stored in the blast database
    return seqsindb
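A hypothetical invocation of createblastdbs, assuming that the `input` dictionary, the GSG and the parsed OPTIONS object come from the surrounding ABGP pipeline (none of them is constructed on this page), could be:

# hedged usage sketch -- input, GSG and OPTIONS are pipeline objects, not built here
seqsindb = createblastdbs(input, GSG, OPTIONS, dbfraction='GSGcentral')
for org in seqsindb.keys():
    print org, seqsindb[org], "sequences written to", input[org]['blastdb']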
Example #11
#     >85              BLOSUM-62               (11,1)
#
##############################################################################
#
# blosum62 = parse_emboss_matrix_file("/opt/ab/share/EMBOSS/data/EBLOSUM62")
# pam30 = parse_emboss_matrix_file("/opt/ab/share/EMBOSS/data/EPAM30")
#
##############################################################################

# Python Imports
from os.path import join as osPathJoin

# import paths to BLOSUM and PAM matrices
from executables import EMBOSS_DATA_DIRECTORY

BLOSUM45_PATH = osPathJoin(EMBOSS_DATA_DIRECTORY, "EBLOSUM45")
BLOSUM62_PATH = osPathJoin(EMBOSS_DATA_DIRECTORY, "EBLOSUM62")
BLOSUM80_PATH = osPathJoin(EMBOSS_DATA_DIRECTORY, "EBLOSUM80")
PAM30_PATH = osPathJoin(EMBOSS_DATA_DIRECTORY, "EPAM30")
PAM70_PATH = osPathJoin(EMBOSS_DATA_DIRECTORY, "EPAM70")


def parse_emboss_matrix_file(fname):
    """
    Parse an EMBOSS protein similarity matrix file
    Supported types are BLOSUM and PAM matrices
    """
    lines = []
    matrix = {}
    aas = []
    for line in open(fname).readlines():
Example #12
# python
PYTHON_PATH = "/usr/bin/python2.6"  # tested are 2.4, 2.6 and 2.7.4
PYTHON_VERSION = "python2.6.2"
# emboss
EMBOSS_EXECUTABLES_PATH = "/usr/bin"
EMBOSS_DATA_DIRECTORY = "/usr/share/EMBOSS/data"
EMBOSS_VERSION = "4.0.0"
# perl
PERL_PATH = "perl"  # only required in case a MySQL GGB database is
# coupled to store and visualize ABFGP output.
PERL_VERSION = "?"  # not used at all.

################################################################################
# Full paths to executables
################################################################################
EXECUTABLE_BLASTALL = osPathJoin(BASEPATH, "software/blast-2.2.8/blastall")
EXECUTABLE_FORMATDB = osPathJoin(BASEPATH, "software/blast-2.2.8/formatdb")
EXECUTABLE_GETORF = osPathJoin(EMBOSS_EXECUTABLES_PATH, "getorf")
EXECUTABLE_TCODE = osPathJoin(EMBOSS_EXECUTABLES_PATH, "tcode")
EXECUTABLE_TRANSEQ = osPathJoin(EMBOSS_EXECUTABLES_PATH, "transeq")
EXECUTABLE_CLUSTALW = osPathJoin(BASEPATH, "software/clustalw-1.83/clustalw")
EXECUTABLE_SIGNALP = osPathJoin(BASEPATH, "software/signalp-3.0/signalp")
EXECUTABLE_TMHMM = osPathJoin(BASEPATH, "software/tmhmm-2.0/bin/tmhmm")
EXECUTABLE_SFM = osPathJoin(BASEPATH,
                            "software/ScanForMatches/scan_for_matches")
EXECUTABLE_HMMPATH = osPathJoin(BASEPATH, "software/hmmer-2.3.2")
EXECUTABLE_HMMSEARCH = osPathJoin(EXECUTABLE_HMMPATH, "hmmsearch")
EXECUTABLE_HMMBUILD = osPathJoin(EXECUTABLE_HMMPATH, "hmmbuild")

EXECUTABLE_CEXPANDER_PATH = osPathJoin(BASEPATH, "software/cexpander-1.0")
EXECUTABLE_CEXPANDER_ALLVSALL = osPathJoin(EXECUTABLE_CEXPANDER_PATH,
                                           "prep_launch.py")
Example #13
# Module metadata
__authors__ = "Ate van der Burgt"
__license__ = "MIT"

# Python imports
import gzip
from subprocess import Popen, PIPE
from os.path import (
    join as osPathJoin,
    abspath as osPathAbspath,
    dirname as osPathDirname,
    )

# absolute paths to executables located within the same directory
EXECUTABLE_ISSINGLEFASTADNA     = osPathJoin(osPathDirname(osPathAbspath(__file__)),"issinglefastadna.sh")
EXECUTABLE_ISSINGLEFASTAPROTEIN = osPathJoin(osPathDirname(osPathAbspath(__file__)),"issinglefastaprotein.sh")

########################################################################
#### Exceptions
########################################################################

class NoSingleFastaFile(Exception):
    pass

########################################################################
#### Validation functions
########################################################################

def IsSingleFastaDna(fname,executable=EXECUTABLE_ISSINGLEFASTADNA):
    """
Example #15
def favicon():
    return send_from_directory(osPathJoin(webShellRootPath, 'static'),
                               'images/favicon.ico',
                               mimetype='image/vnd.microsoft.icon')
Example #16
def robots():
    return send_from_directory(osPathJoin(webShellRootPath, 'static'),
                               'robots.txt',
                               mimetype='text/plain')
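favicon() and robots() look like Flask view functions; a self-contained sketch of how they could be wired up (the app object, the route decorators and webShellRootPath are assumptions, not shown on this page) is:

# minimal sketch, assuming a Flask application and the same static/ layout
from os.path import join as osPathJoin, dirname as osPathDirname, abspath as osPathAbspath
from flask import Flask, send_from_directory

webShellRootPath = osPathDirname(osPathAbspath(__file__))   # assumed location
app = Flask(__name__)

@app.route('/favicon.ico')
def favicon():
    return send_from_directory(osPathJoin(webShellRootPath, 'static'),
                               'images/favicon.ico',
                               mimetype='image/vnd.microsoft.icon')

@app.route('/robots.txt')
def robots():
    return send_from_directory(osPathJoin(webShellRootPath, 'static'),
                               'robots.txt',
                               mimetype='text/plain')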
Example #18
# Python imports that might be absent depending on the version of Python
from sets import Set  # deprecated but backwards compatible in newer versions of Python
from optparse import OptionParser
from optparse import OptionGroup
from optparse import OptionValueError
from StringIO import StringIO

# Most likely Numeric is not installed; this is solved by having it installed
# within the ABFGP code tree
# The path to Numeric is obtained either from the settings dir (absolute)
# or the path is reconstructed (relative)

# absolute path acquisition
from settings.abgp import MAIN_ABGP_PATH as BASEPATH

sys.path.append(osPathJoin(BASEPATH, "requiredmodules"))
from Numeric import zeros, where, greater, greater_equal

# relative path acquisition
sys.path.append(osPathJoin(osPathDirname(osPathAbspath(__file__)), "requiredmodules"))
from Numeric import zeros, where, greater, greater_equal

from abgp_etc import abgpsysexit
from abgp_etc import _blastdb_cleanup
from abgp_etc import _blastdb_cleanup, _file_cleanup
from abgp_exceptions import InproperlyAppliedArgument
from abgp_exceptions import NoCrossdataApplied, NoInputApplied
from abgp_geneconfirmation import *
from abgp_geneconfirmation import geneconfirmation
from abgpgenelocusdirectory import AbgpGeneLocusDirectory
from abgpgenelocusdirectory import IsAbgpGeneLocusDirectory
Example #19
"""
Alignment Based Gene Predictions settings: EMBOSS data files 
"""

from os.path import join as osPathJoin
from abgp import MAIN_ABGP_PATH

################################################################################
# Full paths to Amino Acid similarity matrices
################################################################################
BLOSUM62_PATH   = osPathJoin(MAIN_ABGP_PATH,"datafiles/EMBOSS/EBLOSUM62")
BLOSUM80_PATH   = osPathJoin(MAIN_ABGP_PATH,"datafiles/EMBOSS/EBLOSUM80")
BLOSUM45_PATH   = osPathJoin(MAIN_ABGP_PATH,"datafiles/EMBOSS/EBLOSUM45")
PAM30_PATH      = osPathJoin(MAIN_ABGP_PATH,"datafiles/EMBOSS/EPAM30")
PAM70_PATH      = osPathJoin(MAIN_ABGP_PATH,"datafiles/EMBOSS/EPAM70")

Example #20
"""
Alignment Based Gene Predictions settings: settings for (PSSM) Translational
Start Sites (TSS)
"""
from settings.abgp import MAIN_ABGP_PATH
from os.path import join as osPathJoin

################################################################################
# PSSM_IC file for Translational Start Sites
################################################################################
IC_TSS_DATA_FILE = osPathJoin(MAIN_ABGP_PATH, "datafiles/ic_start_5fungi.txt")
IC_TSS_PATTERN_OFFSET = (9, 7)

################################################################################
# Threshold values for TranslationalStartSite (ATG) PSSM cutoffs
# Recommended to set generously low!
# Non-canonical splice sites are not supported yet
################################################################################
TSS_MIN_PSSM_SCORE = float(-1)
TSS_ALLOW_NON_CANONICAL = False
TSS_NON_CANONICAL_MIN_PSSM_SCORE = float(0)

################################################################################
# Threshold for defining if a TSS is optimal in a given range
################################################################################
TCODE_TSS_5P_WINDOW = 201  # nt coords
TCODE_TSS_3P_WINDOW = 201  # nt coords
TCODE_AVERAGE_SCORE = 0.845

TSS_IS_OPTIMAL_5P_WINDOW = 200  # nt coords, was 250 in the juli2009 abgp_tssanalyses test
TSS_IS_OPTIMAL_3P_WINDOW = 100  # nt coords
Example #21
################################################################################
### PssmObjectCollectionGraph class and inheriting classes                  ####
################################################################################

# Python imports
from sets import Set

# Make sure Numeric is installed somewhere!!!
# For convenience, it is installed in ./requiredmodules
import sys
from os.path import join as osPathJoin
from settings.abgp import MAIN_ABGP_PATH as BASEPATH
sys.path.append(osPathJoin(BASEPATH,"requiredmodules"))
from Numeric import zeros

# graphAbgp imports
from graph_organism import OrganismGraph
from subclass_sitealignment import BasalSiteAlignmentFunctions, sort_by_cumulative_score
from subclass_pssmobjects import BasalPSSMObjectGraphFunctions
import conversion
import ordering
import graphPlus
from exceptions import *


# Global Variable Imports
from settings.translationalstartsites import TCODE_TSS_5P_WINDOW, TCODE_TSS_3P_WINDOW

class PssmObjectCollectionGraph(OrganismGraph,BasalSiteAlignmentFunctions,BasalPSSMObjectGraphFunctions):
    """
    """