def initialize():
    """Initialise the crawler's module-level path and file-name globals."""
    global dataSavePath, post_list_file_name, img_list_file_name, comments_list_file_name, repeat
    # Directory (under the current working dir) where all crawl output is saved.
    dataSavePath = osPathJoin(osGetcwd(), 'CrawlerDataFile')
    # JSON file names for the three output categories.
    post_list_file_name = "post_list.json"
    img_list_file_name = "img_list.json"
    comments_list_file_name = "comments_list.json"
    # Read by HistoryDataLoad(): history is only loaded while this is False.
    repeat = False
def blastall_seq2db(header, sequence, dbname="", blastprogram="blastp", output="ncbiparsed", extra_blastp_params={ 'F': 'F', 'e': '10' }): """ """ if blastprogram not in ['blastp', 'tblastn', 'blastn', 'blastx']: raise "only blastp and tblastn are supported" extra_params = " ".join( ["-%s %s" % (k, v) for k, v in extra_blastp_params.iteritems()]) # generate (semi ;-) unique filename uniquetag = get_random_string_tag() fname = "_".join( [uniquetag, str(header).replace(" ", "_"), sequence[0:10] + ".fa"]) fname = osPathJoin(OSgetcwd(), fname) fh = open(fname, 'w') fh.write(">%s\n%s\n" % (header, sequence)) fh.close() command = "%s -p %s %s -i %s -d %s " % (BLASTALL_PATH, blastprogram, extra_params, fname, dbname) try: ci, co, ce = osPopen3(command) ci.close() if output == "ncbiparsed": b_parser = NCBIStandalone.BlastParser() blastallout = b_parser.parse(co) else: blastallout = co.read() co.close() ce.close() except: # for some kind of - obvious or freak accident case - # Blast or parsing of the blast record failed # No debugging here; just cleanup and return False print "BLAST CRASHED::" print command blastallout = False # remove the created Query file osRemove(fname) # and return! return blastallout
def blastall_seq2db(header,sequence,dbname="",blastprogram="blastp",output="ncbiparsed",extra_blastp_params={'F': 'F', 'e': '10'}):
    """Run blastall for a single query sequence against a blast database.

    @param header:   fasta header string for the query sequence
    @param sequence: query sequence string
    @param dbname:   path to the (formatdb-formatted) blast database
    @param blastprogram: one of 'blastp', 'tblastn', 'blastn', 'blastx'
    @param output:   'ncbiparsed' returns a parsed Blast record,
                     anything else returns the raw blastall output string
    @return: parsed blast record, raw output string, or False on failure
    """
    if blastprogram not in ['blastp','tblastn','blastn','blastx']:
        # NOTE(review): raising a plain string is a TypeError in
        # Python >= 2.6; this should be e.g. ValueError(...).
        raise "only blastp and tblastn are supported"
    # render the extra params dict as "-flag value" command-line options
    extra_params = " ".join(["-%s %s" % (k,v) for k,v in extra_blastp_params.iteritems()])
    # generate (semi ;-) unique filename for the temporary query fasta file
    uniquetag = get_random_string_tag()
    fname = "_".join( [ uniquetag, str(header).replace(" ","_"), sequence[0:10]+".fa" ] )
    fname = osPathJoin(OSgetcwd(),fname)
    fh = open(fname,'w')
    fh.write(">%s\n%s\n" % (header,sequence))
    fh.close()
    command = "%s -p %s %s -i %s -d %s " % (BLASTALL_PATH,blastprogram,extra_params,fname,dbname)
    try:
        ci,co,ce = osPopen3(command)
        ci.close()
        if output == "ncbiparsed":
            b_parser = NCBIStandalone.BlastParser()
            blastallout = b_parser.parse(co)
        else:
            blastallout = co.read()
        co.close()
        ce.close()
    except:
        # for some kind of - obvious or freak accident case -
        # Blast or parsing of the blast record failed
        # No debugging here; just cleanup and return False
        print "BLAST CRASHED::"
        print command
        blastallout = False
    # remove the created Query file
    osRemove(fname)
    # and return!
    return blastallout
def abgpsysexit(input, OPTIONS, message=""):
    """Write a bailout log file into OPTIONS.outdir and terminate the run.

    @param input:   `input` data structure dictionary
    @param OPTIONS: optparse options instance
    @param message: text to write into the bailout log file
    """
    # Resolve which input entry (if any) corresponds to the requested target.
    if OPTIONS.target in input:
        key = OPTIONS.target
    elif OPTIONS.target == None:
        key = None
    else:
        # Fall back on matching the protein fref of each gene locus object.
        key = None
        for candidate in input.keys():
            if input[candidate]['gldobj'].protein_fref() == OPTIONS.target:
                key = candidate
                break
    # Choose the bailout log filename.
    if key:
        fname = "%s.bailout.log" % input[key]['gldobj'].protein_fref()
    elif key == None and OPTIONS.filewithloci:
        hdr = OPTIONS.filewithloci.split("/")[-1].split(".")[0]
        fname = "%s.bailout.log" % hdr
    elif key == None and OPTIONS.dirwithloci:
        hdr = OPTIONS.dirwithloci.split("/")[-1].split(".")[0]
        fname = "%s.bailout.log" % hdr
    else:
        fname = "%s.bailout.log" % ("Notargetapplied")
    # Wipe every file (and directory) from the output directory.
    _file_cleanup(osListdir(OPTIONS.outdir), include_directories=True)
    # Write the message into the freshly created bailout log.
    fname = osPathJoin(OPTIONS.outdir, fname)
    fh = open(fname, 'w')
    fh.write(message + "\n")
    fh.close()
    # safely break out of the ABGP algorithm
    sysExit()
def HistoryDataLoad():
    """Load previously crawled post data from the JSON history file.

    Returns the historical post list, or an empty list when this is a
    repeat run or the history file does not exist yet.
    """
    post_list_history = []
    # Only load history on a fresh (non-repeat) run.
    if not FilePathGlobals.repeat:
        post_list_path = osPathJoin(FilePathGlobals.dataSavePath,
                                    FilePathGlobals.post_list_file_name)
        # Check whether the history file exists before reading it.
        if CheckFile(post_list_path):
            with open(post_list_path, encoding='utf-8',
                      errors='ignore') as f:
                post_list_history = json.load(f, strict=False)
        # NOTE: loading of the img_list / comments_list history files was
        # disabled (commented out) in the original source and is omitted.
    return post_list_history
def abgpsysexit(input, OPTIONS, message=""):
    """Write a bailout log file into OPTIONS.outdir and terminate the run.

    @param input:   `input` data structure dictionary
    @param OPTIONS: optparse options instance
    @param message: text to write into the bailout log file
    """
    # resolve which input entry (if any) corresponds to the requested target
    if input.has_key(OPTIONS.target):
        key = OPTIONS.target
    elif OPTIONS.target == None:
        key = None
    else:
        # find by protein fref
        for k in input.keys():
            if input[k]['gldobj'].protein_fref() == OPTIONS.target:
                key = k
                break
        else:
            # no entry matched the requested target
            key = None
    # get filename to write to
    if key:
        fname = "%s.bailout.log" % input[key]['gldobj'].protein_fref()
    elif key == None and OPTIONS.filewithloci:
        hdr = OPTIONS.filewithloci.split("/")[-1].split(".")[0]
        fname = "%s.bailout.log" % hdr
    elif key == None and OPTIONS.dirwithloci:
        hdr = OPTIONS.dirwithloci.split("/")[-1].split(".")[0]
        fname = "%s.bailout.log" % hdr
    else:
        fname = "%s.bailout.log" % ("Notargetapplied")
    # clean the complete directory from all files
    _file_cleanup(osListdir(OPTIONS.outdir), include_directories=True)
    fname = osPathJoin(OPTIONS.outdir, fname)
    fh = open(fname, 'w')
    fh.write(message + "\n")
    fh.close()
    # safely break out of the ABGP algorithm
    sysExit()
""" Alignment Based Gene Predictions settings: settings for (PSSM) Translational Start Sites (TSS) """ from settings.abgp import MAIN_ABGP_PATH from os.path import join as osPathJoin ################################################################################ # PSSM_IC file for Translational Start Sites ################################################################################ IC_TSS_DATA_FILE = osPathJoin(MAIN_ABGP_PATH,"datafiles/ic_start_5fungi.txt") IC_TSS_PATTERN_OFFSET = (9,7) ################################################################################ # Threshold values for TranslationalStartSite (ATG) PSSM cutoffs # Recommended to set spatiously low! # Non-cannonical splice sites are not supported yet ################################################################################ TSS_MIN_PSSM_SCORE = float(-1) TSS_ALLOW_NON_CANONICAL = False TSS_NON_CANONICAL_MIN_PSSM_SCORE = float(0) ################################################################################ # Threshold for defining if a TSS is optimal in a given range ################################################################################ TCODE_TSS_5P_WINDOW = 201 # nt coords TCODE_TSS_3P_WINDOW = 201 # nt coords TCODE_AVERAGE_SCORE = 0.845
def __init__(self): pass # end of class AcceptorPSSMOptions ################################################################################ # Which (Non)canonical splice sites are recognized/accepted? ################################################################################ CANONICAL_DONOR_SITES = ['GT'] NON_CANONICAL_DONOR_SITES = ['GC'] CANONICAL_ACCEPTOR_SITES = ['AG'] ################################################################################ # PSSM_IC files for donors & acceptors ################################################################################ IC_DONOR_DATA_FILE = osPathJoin(MAIN_ABGP_PATH, "datafiles/ic_donor_5fungi.txt") IC_DONOR_PATTERN_OFFSET = (3,4) IC_ACCEPTOR_DATA_FILE = osPathJoin(MAIN_ABGP_PATH, "datafiles/ic_acceptor_5fungi.txt") IC_ACCEPTOR_PATTERN_OFFSET = (6,3) IC_DONOR_NCGC_DATA_FILE = osPathJoin(MAIN_ABGP_PATH, "datafiles/ic_ncdonorgc_5fungi.txt") ################################################################################ # Threshold values for donor site PSSM cutoffs # Recommended to set spatiously low! # Non-cannonical donor sites are supported yet, but disabled by default ################################################################################ MIN_DONOR_PSSM_SCORE = 0.0 ALLOW_NON_CANONICAL_DONOR = True NON_CANONICAL_MIN_DONOR_PSSM_SCORE = float(3.0)
def CrawlerAction(days):
    """Crawl posts from the last `days` days, merge with history, save to JSON.

    Fetches the post list, crawls each post's content (time, author,
    comments, images), de-duplicates against previously saved history and
    writes the merged post list to the configured JSON file. Errors are
    reported via a printed traceback instead of propagating to the caller.
    """
    try:
        # Initialise the module-level path globals.
        FilePathGlobals.initialize()
        # Fetch the post list for the requested number of days.
        post_list_temp = get_post_list(days)
        # Crawl every post URL and attach its content to the entry:
        # post_time, author, comments, img_name_list, img_url_list.
        post_list = []
        for index, post in enumerate(post_list_temp):
            post_time, author, comments, img_name_list, img_url_list = post_crawler(
                post['url'], index)
            post_list_temp[index]['post_time'] = post_time
            post_list_temp[index]['author'] = author
            post_list_temp[index]['imgs'] = img_url_list
            post_list_temp[index]['comments'] = comments
            post_list.append(post_list_temp[index])
            # Show progress for each crawled page.
            print(post_list[index]['title'])
            print(post_list[index]['url'], '\n')
        # Make sure the save directory exists.
        CheckDir(FilePathGlobals.dataSavePath)
        # Drop entries already present in the history data.
        # (Imported here, as in the original, presumably to avoid an
        # import cycle at module load -- confirm before hoisting.)
        from Model.Crawler.HistoryData.JsonHistoryDataModel import HistoryDataLoad, UpdateRepeatData
        post_list_history = HistoryDataLoad()
        post_list, post_list_history = UpdateRepeatData(
            post_list, post_list_history)
        post_list.extend(post_list_history)
        # Persist the merged post list as JSON.
        with open(osPathJoin(FilePathGlobals.dataSavePath,
                             FilePathGlobals.post_list_file_name),
                  'w', encoding="utf-8") as file:
            json.dump(post_list, file, ensure_ascii=False)
        # NOTE: the separate img_list / comments_list dumps were disabled
        # (commented out) in the original source and are omitted here.
    except Exception:
        # Best-effort crawler: report the traceback, do not crash the caller.
        from traceback import format_exc
        print(format_exc())
def createblastdbs(input,GSG,OPTIONS,dbfraction=None,organism=None,acceptorfids=[],rejectorfids=[]):
    """
    (Re)create blast-db's by masking the areas thar are incorporated in the GSG

    @type  input: dict
    @param input: `input` data structure dictionary

    @type  GSG: GenestructureOfCodingBlockGraphs
    @param GSG: GenestructureOfCodingBlockGraphs instance

    @type  OPTIONS: optparse options instance
    @param OPTIONS: optparse options instance (with attribute 'abinitio')

    @type  dbfraction: string
    @param dbfraction: None, 'all', 'GSGupstream', 'GSGcentral', 'GSGdownstream', 'annotation'

    @type  organism: organism identifier
    @param organism: only recreate blastdb for this organism/gene identifier

    @type  acceptorfids: list with integers
    @param acceptorfids: list of orf ids to accept

    @type  rejectorfids: list with integers
    @param rejectorfids: list of orf ids to reject

    @attention: acceptorfids and rejectorfids are only used when organism is specified!
    """
    # NOTE(review): the mutable default arguments acceptorfids=[]/rejectorfids=[]
    # are never mutated here (only rebound), but remain a Python pitfall.
    seqsindb = {}
    for org in input.keys():
        # if organism is given, do only this one
        if organism and org!=organism: continue
        # acceptorfids anc rejectorfids only valid in combi with `organism`
        if not organism: acceptorfids, rejectorfids = [], []
        # assign blast database name / multi fasta file and open filehandle
        uniquetag = get_random_string_tag()
        fname = '%s-blastdb-%s.fa' % (uniquetag,org)
        fullpath = osPathJoin(OPTIONS.outdir,fname)
        fh = open(fullpath,'w')
        seqsindb[org] = 0
        # distinct cases possible:
        if len(GSG):
            # there is already a GSG, so this is not the first blast iteration
            # do not apply a shortcut when OPTIONS.abinitio == False
            coords = GSG.omsr2mask(org)
            if dbfraction == 'GSGupstream':
                # take only orfs LEFT of the first CBG in GSG
                max_orf_nt_start = max(GSG[0].overall_minimal_spanning_range(organism=org)) * 3
                orflist = input[org]['orfs'].get_elegiable_orfs(max_orf_start=max_orf_nt_start,
                        acceptorfids=acceptorfids,rejectorfids=rejectorfids)
            elif dbfraction == 'GSGdownstream':
                # take only orfs RIGTH of the last CBG in GSG
                min_orf_nt_end = min(GSG[-1].overall_minimal_spanning_range(organism=org)) * 3
                orflist = input[org]['orfs'].get_elegiable_orfs(min_orf_end=min_orf_nt_end,
                        acceptorfids=acceptorfids,rejectorfids=rejectorfids)
            elif dbfraction == 'GSGcentral':
                # take only orfs in between FIRST and LAST CBG in GSG (can be only one CBG!)
                max_orf_nt_start = max(GSG[-1].overall_minimal_spanning_range(organism=org)) * 3
                min_orf_nt_end = min(GSG[0].overall_minimal_spanning_range(organism=org)) * 3
                orflist = input[org]['orfs'].get_elegiable_orfs(min_orf_end=min_orf_nt_end,
                        max_orf_start=max_orf_nt_start,
                        acceptorfids=acceptorfids,rejectorfids=rejectorfids)
            else:
                # dbfraction equals 'all' or None -> no limitation, just take all orfs!
                # do only the general limitation on sublists of orfids
                orflist = input[org]['orfs'].get_elegiable_orfs(
                        acceptorfids=acceptorfids,rejectorfids=rejectorfids)
            # create masked fasta of this sequence part only
            newfasta = input[org]['orfs'].tomaskedfasta(coords=coords,orflist=orflist,header_prefix=org)
            # write to file and count accessions in this file -> seqsindb[org]
            fh.write(newfasta)
            seqsindb[org] = newfasta.count(">")
        else:
            # No filled GSG objects -> no a priori knowledge yet
            # When dbfraction=='annotated' and !OPTIONS.abinitio -> take annotated orfs only
            # TODO: dbfraction is not checked/used here -> just OPTIONS.abinitio
            for orf in input[org]['orfs'].orfs:
                # in case not abinitio, make only a db of orfs in teh current annotation!
                if OPTIONS.abinitio == False and orf.id not in input[org]['orfid-genestructure']: continue
                if orf.id in rejectorfids:
                    # ignore Orfs that are listed as to-be-ignored
                    continue
                if acceptorfids and orf.id not in acceptorfids:
                    # ignore Orfs that are not listed as to-be-accepted
                    continue
                # write fasta of orf to file
                fh.write(orf.tofasta(header="%s_orf_%s" % (org,orf.id))+"\n")
                # increase seqsindb[org] counter
                seqsindb[org]+=1
        # close the filehandle
        fh.close()
        # run formatdb
        formatdb(fname=fullpath)
        # set name of blastdb in infodict
        input[org]['blastdb'] = fullpath
    # return the counter of how much orf sequences are stored in the blast database
    return seqsindb
# >85 BLOSUM-62 (11,1) # ############################################################################## # # blosum62 = parse_emboss_matrix_file("/opt/ab/share/EMBOSS/data/EBLOSUM62") # pam30 = parse_emboss_matrix_file("/opt/ab/share/EMBOSS/data/EPAM30") # ############################################################################## # Python Imports from os.path import join as osPathJoin # import paths to BLOSUM and PAM matrices from executables import EMBOSS_DATA_DIRECTORY BLOSUM45_PATH = osPathJoin(EMBOSS_DATA_DIRECTORY, "EBLOSUM45") BLOSUM62_PATH = osPathJoin(EMBOSS_DATA_DIRECTORY, "EBLOSUM62") BLOSUM80_PATH = osPathJoin(EMBOSS_DATA_DIRECTORY, "EBLOSUM80") PAM30_PATH = osPathJoin(EMBOSS_DATA_DIRECTORY, "EPAM30") PAM70_PATH = osPathJoin(EMBOSS_DATA_DIRECTORY, "EPAM70") def parse_emboss_matrix_file(fname): """ Parse an EMBOSS protein similarity matrix file Supported types are BLOSUM and PAM matrices """ lines = [] matrix = {} aas = [] for line in open(fname).readlines():
# python PYTHON_PATH = "/usr/bin/python2.6" # tested are 2.4, 2.6 and 2.7.4 PYTHON_VERSION = "python2.6.2" # emboss EMBOSS_EXECUTABLES_PATH = "/usr/bin" EMBOSS_DATA_DIRECTORY = "/usr/share/EMBOSS/data" EMBOSS_VERSION = "4.0.0" # perl PERL_PATH = "perl" # only required in case a MySQL GGB database is # coupled to store and visualize ABFGP output. PERL_VERSION = "?" # not used at all. ################################################################################ # Full paths to executables ################################################################################ EXECUTABLE_BLASTALL = osPathJoin(BASEPATH, "software/blast-2.2.8/blastall") EXECUTABLE_FORMATDB = osPathJoin(BASEPATH, "software/blast-2.2.8/formatdb") EXECUTABLE_GETORF = osPathJoin(EMBOSS_EXECUTABLES_PATH, "getorf") EXECUTABLE_TCODE = osPathJoin(EMBOSS_EXECUTABLES_PATH, "tcode") EXECUTABLE_TRANSEQ = osPathJoin(EMBOSS_EXECUTABLES_PATH, "transeq") EXECUTABLE_CLUSTALW = osPathJoin(BASEPATH, "software/clustalw-1.83/clustalw") EXECUTABLE_SIGNALP = osPathJoin(BASEPATH, "software/signalp-3.0/signalp") EXECUTABLE_TMHMM = osPathJoin(BASEPATH, "software/tmhmm-2.0/bin/tmhmm") EXECUTABLE_SFM = osPathJoin(BASEPATH, "software/ScanForMatches/scan_for_matches") EXECUTABLE_HMMPATH = osPathJoin(BASEPATH, "software/hmmer-2.3.2") EXECUTABLE_HMMSEARCH = osPathJoin(EXECUTABLE_HMMPATH, "hmmsearch") EXECUTABLE_HMMBUILD = osPathJoin(EXECUTABLE_HMMPATH, "hmmbuild") EXECUTABLE_CEXPANDER_PATH = osPathJoin(BASEPATH, "software/cexpander-1.0") EXECUTABLE_CEXPANDER_ALLVSALL = osPathJoin(EXECUTABLE_CEXPANDER_PATH,
# Module metadata __authors__ = "Ate van der Burgt" __license__ = "MIT" # Python imports import gzip from subprocess import Popen, PIPE from os.path import ( join as osPathJoin, abspath as osPathAbspath, dirname as osPathDirname, ) # absolute paths to executables located within the same directory EXECUTABLE_ISSINGLEFASTADNA = osPathJoin(osPathDirname(osPathAbspath(__file__)),"issinglefastadna.sh") EXECUTABLE_ISSINGLEFASTAPROTEIN = osPathJoin(osPathDirname(osPathAbspath(__file__)),"issinglefastaprotein.sh") ######################################################################## #### Exceptions ######################################################################## class NoSingleFastaFile(Exception): pass ######################################################################## #### Validation functions ######################################################################## def IsSingleFastaDna(fname,executable=EXECUTABLE_ISSINGLEFASTADNA): """
def favicon():
    """Serve the site favicon from the static images directory."""
    static_dir = osPathJoin(webShellRootPath, 'static')
    return send_from_directory(
        static_dir,
        'images/favicon.ico',
        mimetype='image/vnd.microsoft.icon')
def robots():
    """Serve robots.txt from the static directory."""
    static_dir = osPathJoin(webShellRootPath, 'static')
    return send_from_directory(
        static_dir,
        'robots.txt',
        mimetype='text/plain')
# python PYTHON_PATH = "/usr/bin/python2.6" # tested are 2.4, 2.6 and 2.7.4 PYTHON_VERSION = "python2.6.2" # emboss EMBOSS_EXECUTABLES_PATH = "/usr/bin" EMBOSS_DATA_DIRECTORY = "/usr/share/EMBOSS/data" EMBOSS_VERSION = "4.0.0" # perl PERL_PATH = "perl" # only required in case a MySQL GGB database is # coupled to store and visualize ABFGP output. PERL_VERSION = "?" # not used at all. ################################################################################ # Full paths to executables ################################################################################ EXECUTABLE_BLASTALL = osPathJoin(BASEPATH,"software/blast-2.2.8/blastall") EXECUTABLE_FORMATDB = osPathJoin(BASEPATH,"software/blast-2.2.8/formatdb") EXECUTABLE_GETORF = osPathJoin(EMBOSS_EXECUTABLES_PATH,"getorf") EXECUTABLE_TCODE = osPathJoin(EMBOSS_EXECUTABLES_PATH,"tcode") EXECUTABLE_TRANSEQ = osPathJoin(EMBOSS_EXECUTABLES_PATH,"transeq") EXECUTABLE_CLUSTALW = osPathJoin(BASEPATH,"software/clustalw-1.83/clustalw") EXECUTABLE_SIGNALP = osPathJoin(BASEPATH,"software/signalp-3.0/signalp") EXECUTABLE_TMHMM = osPathJoin(BASEPATH,"software/tmhmm-2.0/bin/tmhmm") EXECUTABLE_SFM = osPathJoin(BASEPATH,"software/ScanForMatches/scan_for_matches") EXECUTABLE_HMMPATH = osPathJoin(BASEPATH,"software/hmmer-2.3.2") EXECUTABLE_HMMSEARCH = osPathJoin(EXECUTABLE_HMMPATH,"hmmsearch") EXECUTABLE_HMMBUILD = osPathJoin(EXECUTABLE_HMMPATH,"hmmbuild") EXECUTABLE_CEXPANDER_PATH = osPathJoin(BASEPATH,"software/cexpander-1.0") EXECUTABLE_CEXPANDER_ALLVSALL = osPathJoin(EXECUTABLE_CEXPANDER_PATH, "prep_launch.py")
# Python imports that might be absent depending on the version of Python from sets import Set # deprecated but backwards compatible in newer verions of python from optparse import OptionParser from optparse import OptionGroup from optparse import OptionValueError from StringIO import StringIO # Most likely Numeric is not installed; this is solved by having it installed # within the ABFGP code tree # The path to Numeric is obtained either from the settings dir (absolute) # or the path is reconstructed (relative) # absolute path acquisition from settings.abgp import MAIN_ABGP_PATH as BASEPATH sys.path.append(osPathJoin(BASEPATH, "requiredmodules")) from Numeric import zeros, where, greater, greater_equal # relative path acquisition sys.path.append(osPathJoin(osPathDirname(osPathAbspath(__file__)), "requiredmodules")) from Numeric import zeros, where, greater, greater_equal from abgp_etc import abgpsysexit from abgp_etc import _blastdb_cleanup from abgp_etc import _blastdb_cleanup, _file_cleanup from abgp_exceptions import InproperlyAppliedArgument from abgp_exceptions import NoCrossdataApplied, NoInputApplied from abgp_geneconfirmation import * from abgp_geneconfirmation import geneconfirmation from abgpgenelocusdirectory import AbgpGeneLocusDirectory from abgpgenelocusdirectory import IsAbgpGeneLocusDirectory
""" Alignment Based Gene Predictions settings: EMBOSS data files """ from os.path import join as osPathJoin from abgp import MAIN_ABGP_PATH ################################################################################ # Full paths to Amino Acid similarity matrices ################################################################################ BLOSUM62_PATH = osPathJoin(MAIN_ABGP_PATH,"datafiles/EMBOSS/EBLOSUM62") BLOSUM80_PATH = osPathJoin(MAIN_ABGP_PATH,"datafiles/EMBOSS/EBLOSUM80") BLOSUM45_PATH = osPathJoin(MAIN_ABGP_PATH,"datafiles/EMBOSS/EBLOSUM45") PAM30_PATH = osPathJoin(MAIN_ABGP_PATH,"datafiles/EMBOSS/EPAM30") PAM70_PATH = osPathJoin(MAIN_ABGP_PATH,"datafiles/EMBOSS/EPAM70")
""" Alignment Based Gene Predictions settings: settings for (PSSM) Translational Start Sites (TSS) """ from settings.abgp import MAIN_ABGP_PATH from os.path import join as osPathJoin ################################################################################ # PSSM_IC file for Translational Start Sites ################################################################################ IC_TSS_DATA_FILE = osPathJoin(MAIN_ABGP_PATH, "datafiles/ic_start_5fungi.txt") IC_TSS_PATTERN_OFFSET = (9, 7) ################################################################################ # Threshold values for TranslationalStartSite (ATG) PSSM cutoffs # Recommended to set spatiously low! # Non-cannonical splice sites are not supported yet ################################################################################ TSS_MIN_PSSM_SCORE = float(-1) TSS_ALLOW_NON_CANONICAL = False TSS_NON_CANONICAL_MIN_PSSM_SCORE = float(0) ################################################################################ # Threshold for defining if a TSS is optimal in a given range ################################################################################ TCODE_TSS_5P_WINDOW = 201 # nt coords TCODE_TSS_3P_WINDOW = 201 # nt coords TCODE_AVERAGE_SCORE = 0.845 TSS_IS_OPTIMAL_5P_WINDOW = 200 # nt coords, was 250 in the juli2009 abgp_tssanalyses test TSS_IS_OPTIMAL_3P_WINDOW = 100 # nt coords
################################################################################ ### PssmObjectCollectionGraph class and inheriting classes #### ################################################################################ # Python imports from sets import Set # Make sure Numeric is installed somewhere!!! # For convenience, it is installed in ./requiredmodules import sys from os.path import join as osPathJoin from settings.abgp import MAIN_ABGP_PATH as BASEPATH sys.path.append(osPathJoin(BASEPATH,"requiredmodules")) from Numeric import zeros # graphAbgp imports from graph_organism import OrganismGraph from subclass_sitealignment import BasalSiteAlignmentFunctions, sort_by_cumulative_score from subclass_pssmobjects import BasalPSSMObjectGraphFunctions import conversion import ordering import graphPlus from exceptions import * # Global Varibale Imports from settings.translationalstartsites import TCODE_TSS_5P_WINDOW, TCODE_TSS_3P_WINDOW class PssmObjectCollectionGraph(OrganismGraph,BasalSiteAlignmentFunctions,BasalPSSMObjectGraphFunctions): """ """
# Python imports that might be absent depending on the version of Python from sets import Set # deprecated but backwards compatible in newer verions of python from optparse import OptionParser from optparse import OptionGroup from optparse import OptionValueError from StringIO import StringIO # Most likely Numeric is not installed; this is solved by having it installed # within the ABFGP code tree # The path to Numeric is obtained either from the settings dir (absolute) # or the path is reconstructed (relative) # absolute path acquisition from settings.abgp import MAIN_ABGP_PATH as BASEPATH sys.path.append(osPathJoin(BASEPATH, "requiredmodules")) from Numeric import zeros, where, greater, greater_equal # relative path acquisition sys.path.append( osPathJoin(osPathDirname(osPathAbspath(__file__)), "requiredmodules")) from Numeric import zeros, where, greater, greater_equal from abgp_etc import abgpsysexit from abgp_etc import _blastdb_cleanup from abgp_etc import _blastdb_cleanup, _file_cleanup from abgp_exceptions import InproperlyAppliedArgument from abgp_exceptions import NoCrossdataApplied, NoInputApplied from abgp_geneconfirmation import * from abgp_geneconfirmation import geneconfirmation from abgpgenelocusdirectory import AbgpGeneLocusDirectory
""" Alignment Based Gene Predictions settings: EMBOSS data files """ from os.path import join as osPathJoin from abgp import MAIN_ABGP_PATH ################################################################################ # Full paths to Amino Acid similarity matrices ################################################################################ BLOSUM62_PATH = osPathJoin(MAIN_ABGP_PATH, "datafiles/EMBOSS/EBLOSUM62") BLOSUM80_PATH = osPathJoin(MAIN_ABGP_PATH, "datafiles/EMBOSS/EBLOSUM80") BLOSUM45_PATH = osPathJoin(MAIN_ABGP_PATH, "datafiles/EMBOSS/EBLOSUM45") PAM30_PATH = osPathJoin(MAIN_ABGP_PATH, "datafiles/EMBOSS/EPAM30") PAM70_PATH = osPathJoin(MAIN_ABGP_PATH, "datafiles/EMBOSS/EPAM70")
def createblastdbs(input, GSG, OPTIONS, dbfraction=None, organism=None,
                   acceptorfids=[], rejectorfids=[]):
    """
    (Re)create blast-db's by masking the areas thar are incorporated in the GSG

    @type  input: dict
    @param input: `input` data structure dictionary

    @type  GSG: GenestructureOfCodingBlockGraphs
    @param GSG: GenestructureOfCodingBlockGraphs instance

    @type  OPTIONS: optparse options instance
    @param OPTIONS: optparse options instance (with attribute 'abinitio')

    @type  dbfraction: string
    @param dbfraction: None, 'all', 'GSGupstream', 'GSGcentral', 'GSGdownstream', 'annotation'

    @type  organism: organism identifier
    @param organism: only recreate blastdb for this organism/gene identifier

    @type  acceptorfids: list with integers
    @param acceptorfids: list of orf ids to accept

    @type  rejectorfids: list with integers
    @param rejectorfids: list of orf ids to reject

    @attention: acceptorfids and rejectorfids are only used when organism is specified!
    """
    # NOTE(review): the mutable default arguments acceptorfids=[]/rejectorfids=[]
    # are never mutated here (only rebound), but remain a Python pitfall.
    seqsindb = {}
    for org in input.keys():
        # if organism is given, do only this one
        if organism and org != organism:
            continue
        # acceptorfids anc rejectorfids only valid in combi with `organism`
        if not organism:
            acceptorfids, rejectorfids = [], []
        # assign blast database name / multi fasta file and open filehandle
        uniquetag = get_random_string_tag()
        fname = '%s-blastdb-%s.fa' % (uniquetag, org)
        fullpath = osPathJoin(OPTIONS.outdir, fname)
        fh = open(fullpath, 'w')
        seqsindb[org] = 0
        # distinct cases possible:
        if len(GSG):
            # there is already a GSG, so this is not the first blast iteration
            # do not apply a shortcut when OPTIONS.abinitio == False
            coords = GSG.omsr2mask(org)
            if dbfraction == 'GSGupstream':
                # take only orfs LEFT of the first CBG in GSG
                max_orf_nt_start = max(
                    GSG[0].overall_minimal_spanning_range(organism=org)) * 3
                orflist = input[org]['orfs'].get_elegiable_orfs(
                    max_orf_start=max_orf_nt_start,
                    acceptorfids=acceptorfids,
                    rejectorfids=rejectorfids)
            elif dbfraction == 'GSGdownstream':
                # take only orfs RIGTH of the last CBG in GSG
                min_orf_nt_end = min(
                    GSG[-1].overall_minimal_spanning_range(organism=org)) * 3
                orflist = input[org]['orfs'].get_elegiable_orfs(
                    min_orf_end=min_orf_nt_end,
                    acceptorfids=acceptorfids,
                    rejectorfids=rejectorfids)
            elif dbfraction == 'GSGcentral':
                # take only orfs in between FIRST and LAST CBG in GSG (can be only one CBG!)
                max_orf_nt_start = max(
                    GSG[-1].overall_minimal_spanning_range(organism=org)) * 3
                min_orf_nt_end = min(
                    GSG[0].overall_minimal_spanning_range(organism=org)) * 3
                orflist = input[org]['orfs'].get_elegiable_orfs(
                    min_orf_end=min_orf_nt_end,
                    max_orf_start=max_orf_nt_start,
                    acceptorfids=acceptorfids,
                    rejectorfids=rejectorfids)
            else:
                # dbfraction equals 'all' or None -> no limitation, just take all orfs!
                # do only the general limitation on sublists of orfids
                orflist = input[org]['orfs'].get_elegiable_orfs(
                    acceptorfids=acceptorfids, rejectorfids=rejectorfids)
            # create masked fasta of this sequence part only
            newfasta = input[org]['orfs'].tomaskedfasta(coords=coords,
                                                        orflist=orflist,
                                                        header_prefix=org)
            # write to file and count accessions in this file -> seqsindb[org]
            fh.write(newfasta)
            seqsindb[org] = newfasta.count(">")
        else:
            # No filled GSG objects -> no a priori knowledge yet
            # When dbfraction=='annotated' and !OPTIONS.abinitio -> take annotated orfs only
            # TODO: dbfraction is not checked/used here -> just OPTIONS.abinitio
            for orf in input[org]['orfs'].orfs:
                # in case not abinitio, make only a db of orfs in teh current annotation!
                if OPTIONS.abinitio == False and orf.id not in input[org][
                        'orfid-genestructure']:
                    continue
                if orf.id in rejectorfids:
                    # ignore Orfs that are listed as to-be-ignored
                    continue
                if acceptorfids and orf.id not in acceptorfids:
                    # ignore Orfs that are not listed as to-be-accepted
                    continue
                # write fasta of orf to file
                fh.write(
                    orf.tofasta(header="%s_orf_%s" % (org, orf.id)) + "\n")
                # increase seqsindb[org] counter
                seqsindb[org] += 1
        # close the filehandle
        fh.close()
        # run formatdb
        formatdb(fname=fullpath)
        # set name of blastdb in infodict
        input[org]['blastdb'] = fullpath
    # return the counter of how much orf sequences are stored in the blast database
    return seqsindb
pass # end of class AcceptorPSSMOptions ################################################################################ # Which (Non)canonical splice sites are recognized/accepted? ################################################################################ CANONICAL_DONOR_SITES = ['GT'] NON_CANONICAL_DONOR_SITES = ['GC'] CANONICAL_ACCEPTOR_SITES = ['AG'] ################################################################################ # PSSM_IC files for donors & acceptors ################################################################################ IC_DONOR_DATA_FILE = osPathJoin(MAIN_ABGP_PATH, "datafiles/ic_donor_5fungi.txt") IC_DONOR_PATTERN_OFFSET = (3, 4) IC_ACCEPTOR_DATA_FILE = osPathJoin(MAIN_ABGP_PATH, "datafiles/ic_acceptor_5fungi.txt") IC_ACCEPTOR_PATTERN_OFFSET = (6, 3) IC_DONOR_NCGC_DATA_FILE = osPathJoin(MAIN_ABGP_PATH, "datafiles/ic_ncdonorgc_5fungi.txt") ################################################################################ # Threshold values for donor site PSSM cutoffs # Recommended to set spatiously low! # Non-cannonical donor sites are supported yet, but disabled by default ################################################################################ MIN_DONOR_PSSM_SCORE = 0.0 ALLOW_NON_CANONICAL_DONOR = True NON_CANONICAL_MIN_DONOR_PSSM_SCORE = float(3.0)