Example #1
PREPROCESSOR_SCRIPT_DIR = os.path.join(PAR_DIR, 'preprocessors')
RESOURCES_DIR = os.path.join(PAR_DIR, 'resources')

import pepInput
import shutil
import ArgLib
import DataFile
import glob
import pickle

if __name__ == '__main__':
    print 'dtadir is the directory containing the mzXML files to analyze'
    print 'peaks is a dictionary mapping {experiment_name: peaks csv}'
    print 'output is the directory to move all files to and set up the project in'
    options = ArgLib.parse(['init', 'dtadir', 'peaks', 'output'])

    print 'options.output: %s' % (options.output)
    print 'normpath(options.output): %s' % (os.path.normpath(options.output))
    # Fails with an OSError if directory already exists
    os.makedirs(options.output)
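
    # Hedged note (not part of the original script): for reruns where the project
    # directory may already exist, an idempotent variant of the call above would be
    #   if not os.path.isdir(options.output):
    #       os.makedirs(options.output)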

    # Create database
    args = ['--sqlite', os.path.join(options.output, 'results.db')]
    print 'Models.py dir: %s' % (DATABASE_SCRIPT_DIR)
    DataFile.executeProcess(DATABASE_SCRIPT_DIR, 'Models.py', args)

    # Make experiment directories
    # Structure
    # /options.output
    # .../ExperimentName
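    # Hedged sketch of the layout described above (the original snippet is
    # truncated here): one subdirectory per experiment under options.output,
    # with the experiment names taken from the --peaks dictionary, which other
    # scripts in this listing pass as an eval-able dict literal.
    for experiment_name in eval(options.peaks):
        os.makedirs(os.path.join(options.output, experiment_name))
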
    return scanFDict


if __name__ == "__main__":
    options = ArgLib.parse(
        [
            "init",
            "dtadir",
            "config",
            "model",
            "output",
            "columns",
            "verbose",
            "paircutoff",
            "ppmsyserror",
            "ppmstd",
            "ppmpenalty",
            "ambigpenalty",
            "minedge",
            "maxedge",
            "alpha",
            "subgraphcut",
            "symbolmap",
        ]
    )
    epStep = 0.00025
    maxEp = 0.1

    paramsDict = ArgLib.parseInitFile(options.init, options)
    with open(options.symbolmap, "r") as fin:
Example #3
        try:
            for i in range(int(dataDict['Num Ambig Edges'].max())):
                outFile.write('\t'.join([str(i), str(len(np.where(dataDict['Num Ambig Edges'] == i)[0]))]) + '\n')
        except ValueError:
            outFile.write('\t'.join(['N/A', 'N/A']) + '\n')
        except KeyError:
            print 'ERROR: No data for Ambiguous Edges available'

        try:
            writeHistogram(outFile, dataDict['PScore'], '%s %s PScore' % (progName, name))
        except KeyError:
            print 'ERROR: No data for PScore available'
    
unitTestName = 'LADS Unit Test'
if __name__ == '__main__':
    options = ArgLib.parse(['init', 'denovoscript', 'dtadir', 'config', 'model', 'output', 'columns', 'verbose', 'paircutoff', 'ppmsyserror', 'ppmstd', 'ppmpenalty', 'ambigpenalty', 'minedge', 'maxedge', 'alpha', 'lads', 'sequest', 'mascot', 'pepnovo', 'database', 'mainprogname', 'progdict', 'comp', 'subgraphcut', 'symbolmap', 'pnovo', 'peaks', 'combined', 'srchid'])
    
    outBase = os.path.splitext(options.output)[0]
    paramsDict = ArgLib.parseInitFile(options.init, options)

    interpreter = 'python2.6'
    if options.denovoscript:
        DNSprog = options.denovoscript
        progDict = {unitTestName: 'LADS'}
    else:
        progDict = {}

    if options.progdict:
        print options.progdict
        progDict = eval(options.progdict)
    else:
Example #4
    #print procSeq
    if cutOff == 0:
        return ''.join(procSeq), []
    else:
        ambig_edges = []
        for i, aa in enumerate(procSeq):
            if LCs[i] < cutOff:
                procSeq[i] = ambigAA
                ambig_edges += [(0, Constants.aminoacids[aa][2])]
        return ''.join(procSeq), ambig_edges


if __name__ == '__main__':
    print 'In this program, the PEAKS argument is just the location of the PEAKS output to parse. The number argument indicates the ALC cutoff used to form ambig edges (set to 0 to not form any ambiguous edges).'
    options = ArgLib.parse(['init', 'output', 'symbolmap', 'peaks', 'cutoff'])

    AMBIG_AA = '@'

    paramsDict = ArgLib.parseInitFile(options.init, options)
    with open(options.symbolmap, 'r') as fin:
        symbolMap = pickle.load(fin)
    seqMap = DataFile.generateSeqMap({'PEAKS': 'PEAKS'}, symbolMap, paramsDict)
    #print seqMap

    scanInfo = DataFile.getScanInfo(options.peaks, delimiter=',')[1]
    if 'Peptide' in scanInfo[0]:
        seq_col = 'Peptide'
    else:
        seq_col = 'Sequence'
                    'em': top_item[2][0],
                    'log_em': top_item[2][1],
                    'sib': item[1][0],
                    'con': item[1][1],
                    'occ': item[1][2],
                    'top': True
                }]

    print 'Executing update'
    connection.execute(stmt, update_data)


if __name__ == '__main__':
    print 'The modmaxcounts argument is the number of iterations in the initial EM over all results; the maxcounts argument is the maximum number of iterations before expectation-maximization terminates after reranking is completed (on the top-ranked results only). Set fracs to "all" to run over all fractions in the experiment, or supply the desired fractions to run EM over, separated by commas.'
    options = ArgLib.parse([
        'init', 'sqlitedb', 'experimentname', 'fracs', 'maxcounts',
        'modmaxcounts', 'output'
    ])

    t1 = time.time()
    paramsDict = ArgLib.parseInitFile(options.init, options)
    outBase = os.path.splitext(options.output)[0]

    engine = create_engine('sqlite:///' + options.sqlitedb, echo=True)
    conn = engine.connect()

    conn.execute("PRAGMA max_page_count = max_page;")
    conn.execute("PRAGMA temp_store = 2;")
    conn.execute("PRAGMA page_size")

    try:
        experiment_id = conn.execute(
Example #6
    try:
        engine.execute('ALTER TABLE %s ADD COLUMN %s %s' %
                       (table_name, column_name, column_type))
    except sqlalchemy.exc.OperationalError:
        print 'CRITICAL ERROR: column %s already exists in table' % column_name


def get_tissue_experiment_map(tissue_paths):
    tissue_map = defaultdict(list)
    for experiment_dir in tissue_paths:
        db_loc = os.path.join(experiment_dir, 'results.db')
        engine = create_engine('sqlite:///' + db_loc, echo=True)
        conn = engine.connect()

        experiments = Models.fetch_all_experiments(conn)
        for experiment_id, experiment_name in experiments:
            tissue_map[os.path.basename(
                os.path.normpath(experiment_dir))] += [experiment_name]

        conn.close()

    return tissue_map
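
# Hedged usage sketch (not part of the original module): each entry of tissue_paths
# is an experiment directory that contains a results.db, so with hypothetical
# directories /data/liver_expt and /data/kidney_expt a call like
#   get_tissue_experiment_map(['/data/liver_expt', '/data/kidney_expt'])
# returns {'liver_expt': [<experiment names>], 'kidney_expt': [<experiment names>]}.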


if __name__ == '__main__':
    print 'Creates DB from defined schema'
    options = ArgLib.parse(['sqlitedb'])

    engine = create_engine('sqlite:///' + options.sqlitedb, echo=True)
    Base.metadata.create_all(engine)
Example #7
        for seq in rand_smpl:
            out_file.write(seq + '\n')
        out_file.close()
    else:
        print('maxsize g')
        out_file = open(out_file_name, 'w')
        for seq in new_seqs:
            out_file.write(seq + '\n')
        out_file.close()


if __name__ == '__main__':
    print(
        'This program will take a FASTA file from the --lads argument and output a six-frame translation of the file to output. Number refers to maximum size of sequence in resulting FASTA file. If a chromosomal region exceeds this length with no stop codons, the sequence will be chunked with a 100 aa overhang at each edge. Minimum Length of peptide in FASTA file is 5.'
    )
    options = ArgLib.parse(['lads', 'output', 'number'])

    outFile = open(options.output, 'w')
    #chunkSize = int(options.number)

    for seqName, sequence in sequenceGenerator(options.lads):
        #for frame in [1, 2, 3, -1, -2, -3]:
        for frame in [1, 2, 3]:
            #print(seqName, frame)
            transSeq = getTranslation(sequence.upper(), frame)
            #chunkAndWriteSequence(outFile, seqName, transSeq, frame, len(sequence), lineLength=60, chunkSize=2000, dontReadThrough=['X'], minPeptLength=10, overhang=50)
            #transSeqName = seqName + ('_+' if frame > 0 else '_') + str(frame)
            transSeqName = getTransSeqNameForGFY(seqName, frame)
            writeSequence(outFile, transSeqName, transSeq)

    outFile.close()
    if start is None:
        start = list(findAll(peptide,proteinSeq))
        end = [startInd + len(peptide) for startInd in start]
    proteinSeq = '-' + proteinSeq + '-'
    return [proteinSeq[start[i]] + '.' + proteinSeq[start[i]+1:end[i]+1] + '.' + proteinSeq[end[i]+1] for i in range(len(start))]

def getStartAndEndInds(proteinSeq, peptide):
    startInd = proteinSeq.index(peptide)
    return startInd, startInd+len(peptide)

def removeNoncanonicalAminoAcids(subjectSeq):
    return subjectSeq.replace('Z', 'Q').replace('B', 'N').replace('X', '').replace('U', 'C').replace('L', 'I').replace('J', '')
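
# Hedged worked example (not part of the original script) showing what the two
# helpers above do; it only touches in-memory strings, so it is safe to call.
def _demo_sequence_helpers():
    # 'L' collapses to 'I' and the wildcard 'X' is dropped entirely.
    assert removeNoncanonicalAminoAcids('PEPTLDEX') == 'PEPTIDE'
    # 'PEPTIDE' spans positions 3..10 of the subject sequence.
    assert getStartAndEndInds('AAAPEPTIDEKKK', 'PEPTIDE') == (3, 10)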

if __name__ == '__main__':
    print 'This program will take a tdv file of BLAST results indexed by scanF and attempt to explain the discrepancies using the unimod modification dictionary. Mainprogname is a dict mapping the names of the score and peptide fields to their corresponding URIs. Fields are ScanF, Peptide, References, Ambig Edges (optional), Ref Peptide, and Num Identical'
    options = ArgLib.parse(['init', 'output', 'ppmstd', 'comp', 'unimoddict', 'mainprogname'], [{'opts': ('-f', '--fasta'), 'attrs': {'type': 'string', 'dest': 'fasta', 'help': 'Location of reference fasta containing reference proteins (same file used to generate BLAST DB).'}}])

    paramsDict = ArgLib.parseInitFile(options.init, options)

    infoDict = eval(options.mainprogname)

    with open(options.unimoddict) as fin:
        unimodDict = pickle.load(fin)
    hashedUnimodDict = hashUnimodDict(unimodDict)

    outFile = open(options.output, 'w')
    cols = ['ScanF', 'Score', 'Peptide', 'Unmod Peptide', 'References', 'Modifications', 'DB Peptide', 'Alignment Score']
    if 'Ambig Edges' in infoDict:
        cols.insert(2, 'Ambig Edges')
        
    outFile.write('\t'.join([col for col in cols]) + '\n')
Example #9
    for item in proc_tag_graph.values():
        scanData = {'ScanF': item['ScanF'], 'Alignment Score': item['Alignment Score'], 'Matching Tag Length': item['Matching Tag Length'], 'PEAKS ALC (%)': item['De Novo Score'], 'PEAKS Peptide': item['De Novo Peptide'], 'Decoy?': item['Decoy Status']}
        for context in item['Context']:
            scanData['Context'] = context[0]
            scanData['Modifications'] = context[1]
            scanData['Proteins'] = context[2]
            
            outFile.write('\t'.join([str(scanData[col]) for col in cols]) + '\n')
            
    outFile.close()
                                                                        

if __name__ == '__main__':
    print 'dtadir argument points to parent directory of de_novo and mzML files'
    print 'output is a string of values to append to the de novo filename in the output'
    options = ArgLib.parse(['init', 'dtadir', 'ppmstd', 'modtolerance', 'unimoddict', 'modmaxcounts', 'maxcounts', 'fmindex', 'denovo', 'model', 'config', 'output'], [{'opts': ('-F', '--fraction'), 'attrs': {'type': 'int', 'dest': 'fraction', 'help': 'Fraction to run TAG_GRAPH on'}}, {'opts': ('-x', '--splittaxon'), 'attrs': {'dest': 'splittaxon', 'action': 'store_true', 'default': False, 'help': 'Flag. For searches of metaproteomic databases, split identical context entries by taxon for accurate consideration via EM.'}}])

    fileFound = False
    
    outDir = os.path.join(options.dtadir, 'taggraph')
    peaksDir = os.path.join(options.dtadir, 'de_novo')
    dataDir = os.path.join(options.dtadir, 'data')
    localDtaDir = ''
    try:
        # This should throw an exception if the directory for dta files does not yet exist (it will then be created)
        ''' Replace os.path.sep with '/' to fix Windows backslash issues. --smp
        dtaDir = glob.glob(dataDir + os.path.sep + '*f%02d'%options.fraction)[0] + os.path.sep
        '''
        localDtaDir = glob.glob(dataDir + '/' + '*f%02d'%options.fraction)[0] + '/'
        print localDtaDir+"\n"
    except IndexError:
Example #10
    #session.commit()

    for fraction_name in fractions:
        results_files = glob.glob(results_dir +  os.path.sep + '*_' + fraction_name + '_*tdv')
        print fraction_name, results_files
        if importTAGGRAPHResults(connection, experiment_name, fraction_name, results_files):
            print 'Fraction %s imported successfully'%fraction_name
        else:
            print 'ERROR: Unable to import results for fraction %s'%fraction_name

    return True



if __name__ == '__main__':
    options = ArgLib.parse(['sqlitedb', 'taggraph', 'init', 'fmindex', 'modtolerance', 'ppmstd', 'maxcounts', 'modmaxcounts', 'experimentname', 'fracs'], optArgs=[{ 'opts': ('-Y', '--type'), 'attrs': {'type': 'string', 'dest': 'type', 'default': None, 'help': 'Value is either experiment or single; defines whether the import is an experiment import or a single-sample import'} }])

    print 'If importing just a single fraction, ignore all arguments except sqlitedb, taggraph, type (set to single), experimentname, and fracs (set to the fraction name)'
    print 'If the type of import is experiment, set the taggraph argument to the directory where the results are located, and fracs to a tuple of fractions to import. TAGGRAPH parameters (init, fmindex, modtolerance, ppmstd, maxcounts, modmaxcounts) will be saved in the db for record keeping'

    engine = create_engine('sqlite:///' + options.sqlitedb, echo=True)
    #Session = sessionmaker(bind=engine)

    #session = Session()

    conn = engine.connect()

    if options.type == "single":
        importTAGGRAPHResults(conn, options.experimentname, options.fracs, eval(options.taggraph))
    elif options.type == "experiment":
            binsDict[numBins-1][1] += 1
        else:
            binsDict[bin][1] += 1

    for bin in binsDict:
        binsDict[bin][2] = binsDict[bin][0] - binsDict[bin][1]

    outFile.write('\n%s Scan Number Difference Distribution. Max Diff: %i' % (name, maxDiff) + '\n')
    outFile.write('\t'.join(['Diff Bin', 'Test Pairs', 'True Pairs', 'False Pairs']) + '\n')
    for i in range(numBins):
        outFile.write('\t'.join([str(elem) for elem in [bins[i], binsDict[i][0], binsDict[i][1], binsDict[i][2]]]) + '\n')
    
    
if __name__ == '__main__':
    print 'Model refers to svmmodel used'
    options = ArgLib.parse(['dtadir', 'combined', 'sequest', 'mascot', 'database', 'output', 'ppmstd', 'init', 'symbolmap'])
    
    paramsDict = ArgLib.parseInitFile(options.init, options)
    progDict = ArgLib.getProgDict(An.searchprogs, options)

    dbDict = DataFile.getDBInfo(options.database)
    infoMap = dbDict['infoMap']
    
    with open(options.symbolmap, 'r') as fin:
        symbolMap = pickle.load(fin)
    
    seqMap = DataFile.generateSeqMap(progDict, symbolMap, paramsDict)

    processedInfo = {} 
    
    if options.mascot:
Example #12
        else:
            return (seq, None)
    except KeyError:
        return False

def parseDBScans(fDict, prog, seqMap, dbDict):
    processedInfo = {}
    for csvfile in fDict.keys():
        MASCOTData = DataFile.getScanInfo(csvfile, dbDict[prog]['fields'], delimiter=',')
        processedInfo[fDict[csvfile]] = An.preprocessDatabaseScanInfo(MASCOTData, seqMap[fDict[csvfile]], dbDict[prog]['fieldmap'])
    
    return processedInfo
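
# Hedged usage sketch (not part of the original script): fDict maps each search
# engine CSV to the experiment name its results are stored under, so a
# hypothetical call such as
#   parseDBScans({'run1_mascot.csv': 'Run1'}, 'MASCOT', seqMap, dbDict)
# returns a dict keyed by experiment name, e.g. {'Run1': <preprocessed scan info>}.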

# Number argument refers to the minimum number of search program results that must report the same peptide for it to be included in the final output
if __name__== '__main__':
    options = ArgLib.parse(['init', 'sequest', 'lads', 'mascot', 'output', 'database', 'symbolmap', 'number'])
    
    paramsDict = ArgLib.parseInitFile(options.init, options)
    dbDict = DataFile.getDBInfo(options.database)
    progDict = ArgLib.getProgDict(An.searchprogs, options)
    
    with open(options.symbolmap, 'r') as fin:
        symbolMap = pickle.load(fin)
    seqMap = DataFile.generateSeqMap(progDict, symbolMap, paramsDict)
    
    if hasattr(options, 'number'):
        minNumScans = int(options.number)
    else:
        minNumScans = 1
        
    processedInfo = {}  
Example #13
    for bin in binsDict:
        binsDict[bin][2] = binsDict[bin][0] - binsDict[bin][1]

    outFile.write("\n%s Scan Number Difference Distribution. Max Diff: %i" % (name, maxDiff) + "\n")
    outFile.write("\t".join(["Diff Bin", "Test Pairs", "True Pairs", "False Pairs"]) + "\n")
    for i in range(numBins):
        outFile.write(
            "\t".join([str(elem) for elem in [bins[i], binsDict[i][0], binsDict[i][1], binsDict[i][2]]]) + "\n"
        )


if __name__ == "__main__":
    print "Model refers to svmmodel used"
    options = ArgLib.parse(
        ["dtadir", "combined", "sequest", "mascot", "database", "output", "ppmstd", "init", "symbolmap"]
    )

    paramsDict = ArgLib.parseInitFile(options.init, options)
    progDict = ArgLib.getProgDict(An.searchprogs, options)
    dbDict = DataFile.getDBInfo(options.database)
    with open(options.symbolmap, "r") as fin:
        symbolMap = pickle.load(fin)

    seqMap = DataFile.generateSeqMap(progDict, symbolMap, paramsDict)
    outFile = open(options.output, "w")

    print options.dtadir
    dtaList = glob.glob(options.dtadir + "/*.dta")
    scanFDict = getScanFDict(dtaList)
    print "Saved %d Generators to Gens.Iterative.lh5" % Generators_Original['XYZList'].shape[0]
    return


if __name__ == "__main__":
    print """\nAssigns data to generators that was not originally used in the
clustering.\n
Output:
-- Assignments.h5: a matrix of assignments where each row is a vector
corresponding to a data trajectory. The values of this vector are the cluster
assignments.
-- Assignments.h5.RMSD: Gives the RMSD from the assigned frame to its Generator.
\n"""

    arglist=["projectfn", "generators", "atomindices","outdir","rmsdcutoff","assignments","assrmsd","stride"]
    options=ArgLib.parse(arglist)
    print sys.argv
    print options
    P1 = Project.Project.LoadFromHDF(options.projectfn)
    AInd = numpy.loadtxt(options.atomindices, int)
    Generators = Trajectory.Trajectory.LoadTrajectoryFile(options.generators, Conf=P1.Conf)
    if options.assignments != "None":
        ass = Serializer.LoadData(options.assignments)
    else:
        ass = None
    if options.assrmsd != "None":
        assrmsd = Serializer.LoadData(options.assrmsd)
    else:
        assrmsd = None
    stride = int(options.stride)
    run(P1, AInd, Generators, options.outdir, float(options.rmsdcutoff), ass, assrmsd, stride)
    return featureNames

def writeFeatures(featureList, rank, qid, outFile, comment=""):
    outFile.write('%i ' % (rank,) + ' '.join(['%i:%f' % (i+1, feature) for i, feature in enumerate(featureList)]) + '# %s\n' % (comment,))

def printFeatureNames(featureNames):
    for i, feature in enumerate(featureNames):
        print '%i. %s' % (i+1, feature)

def printFeatures(featureNames, featureList):
    for i, feature in enumerate(featureNames):
        print '%i. %s: %f' % (i+1, feature, featureList[i])
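
# Hedged usage sketch (not part of the original script): write one LETOR-style
# training line to an in-memory buffer. Note that writeFeatures above does not
# emit the qid; it writes only the rank, the 1-indexed features, and a comment.
def _demo_write_features():
    from StringIO import StringIO  # Python 2, matching the rest of this listing
    buf = StringIO()
    writeFeatures([0.5, 2.0], rank=1, qid=7, outFile=buf, comment='scan 1234')
    assert buf.getvalue() == '1 1:0.500000 2:2.000000# scan 1234\n'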

if __name__ == '__main__':
    print 'This program generates LETOR-format training data for training a discriminator. dtadir is of the format {/loc of dtadir: (loc of LADS SequenceDTAsTDV.py LOG file, loc of combined SEQUEST-MASCOT database results)}'
    options = ArgLib.parse(['init', 'dtadir', 'ppmstd', 'symbolmap', 'output', 'model', 'config'])

    paramsDict = ArgLib.parseInitFile(options.init, options)
    pairConfigurations = paramsDict['Pair Configurations']
    ppm = float(options.ppmstd)

    dtadirInfo = eval(options.dtadir)

    with open(options.symbolmap, 'r') as fin:
        symbolMap = pickle.load(fin)
    seqMap = DataFile.generateSeqMap({'LADS Unit Test': 'LADS'}, symbolMap, paramsDict)
    seqMap = seqMap['LADS Unit Test']

    PNet = PN.ProbNetwork(options.config, options.model)
    outFile = open(options.output, 'w')
Example #16
        progDict[name] = prog


def getAllScanF(processedInfo):
    scanFs = np.array([], dtype=np.dtype('int'))
    for progName in processedInfo.keys():
        scanFs = np.append(
            scanFs,
            np.array(processedInfo[progName].keys(), dtype=np.dtype('int')))

    return np.unique(scanFs)
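
# Hedged worked example (not part of the original script): scan numbers reported
# by two hypothetical search programs are pooled and deduplicated.
def _demo_get_all_scanF():
    processedInfo = {'SEQUEST': {1: None, 3: None}, 'MASCOT': {2: None, 3: None}}
    assert list(getAllScanF(processedInfo)) == [1, 2, 3]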


if __name__ == '__main__':
    options = ArgLib.parse([
        'init', 'lads', 'sequest', 'mascot', 'pepnovo', 'output', 'database',
        'symbolmap', 'pnovo', 'peaks', 'combined'
    ])

    paramsDict = ArgLib.parseInitFile(options.init, options)
    progDict = ArgLib.getProgDict(DataFile.searchprogs, options)

    with open(options.symbolmap, 'r') as fin:
        symbolMap = pickle.load(fin)

    seqMap = DataFile.generateSeqMap(progDict, symbolMap, paramsDict)

    dbDict = DataFile.getDBInfo(options.database)
    processedInfo = {}
    if options.lads:
        LADSdict = eval(options.lads)
        for tdvfile in LADSdict.keys():
from Models import Experiment, Result, Fraction

from sqlalchemy import create_engine, func
from sqlalchemy.sql import select, and_

experiment = Experiment.__table__
fraction = Fraction.__table__
result = Result.__table__

from collections import defaultdict

if __name__ == '__main__':
    print 'output is the name of the file to write the report to'
    print 'experimentname is the experiment to check for integrity'
    print 'dtadir is the directory of the parent project'
    options = ArgLib.parse(['dtadir', 'experimentname', 'output'])
    sqlitedb_loc = os.path.join(options.dtadir, 'results.db')
    engine = create_engine('sqlite:///' + sqlitedb_loc, echo=True)
    conn = engine.connect()
    experiment_name = options.experimentname
    append_string = ''
    try:
        experiment_id = conn.execute(
            select([
                experiment.c.id
            ]).where(experiment.c.name == experiment_name)).fetchone()[0]
    except TypeError:
        append_string = '.ERROR'
        experiment_id = 0
    experiment_dir = os.path.join(options.dtadir, experiment_name)
    #Get all the fraction numbers
    for line in modelFile:
        if line[0] == '#':
            continue

        weakLearners = line.split(' ')
        for learner in weakLearners:
            learnerTuple = learner.split(':')
            rankingModel += [(int(learnerTuple[0]), float(learnerTuple[1]), float(learnerTuple[2]))]

    return rankingModel
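
# Hedged note on the model-file format assumed above (not part of the original
# script): each non-comment line holds space-separated weak learners, each encoded
# as three colon-separated numbers (an int followed by two floats), so a line like
#   3:0.500000:1.250000 7:-0.200000:0.800000
# parses to [(3, 0.5, 1.25), (7, -0.2, 0.8)].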

            

if __name__ == '__main__':
    print 'This program generates a results file containing raw LADS output postscored with the algorithm of choice. The discmodel is a supplied model, if needed by the postscoring algorithm.'
    options = ArgLib.parse(['init', 'ppmstd', 'dtadir', 'lads', 'sequest', 'config', 'model', 'output', 'symbolmap'], optArgs=[{'opts': ('-D', '--discmodel'), 'attrs': {'type': 'string', 'dest': 'discmodel', 'help': 'Model used to calculate discriminant score'}}, {'opts': ('-P', '--pairconfig'), 'attrs': {'type': 'string', 'dest': 'pairconfig', 'help': 'Name of LADS Pair Configuration'}}, {'opts': ('-F', '--featurelist'), 'attrs': {'type': 'string', 'dest': 'featurelist', 'help': 'File containing pickled list of desired features (optional)'}}])
    parent = os.path.abspath(os.pardir)
                           
    PNet = PN.ProbNetwork(options.config, options.model)
    
    paramsDict = ArgLib.parseInitFile(options.init, options)
    pairConfigurations = paramsDict['Pair Configurations']

    LADSSeqInfo = GLFD.parseSequenceDTAsLogfile(options.lads)

    with open(options.symbolmap, 'r') as fin:
        symbolMap = pickle.load(fin)
    seqMap = DataFile.generateSeqMap({'LADS Unit Test': 'LADS'}, symbolMap, paramsDict)
    seqMap = seqMap['LADS Unit Test']

    if options.featurelist:
    writeSequence(outFile, seqName, fullSequence[startInd:endInd], lineLength=lineLength)

def writeSequence(outFile, seqName, sequence, lineLength=60):
    outFile.write(seqName + '\n')
    for i in range(0, len(sequence), lineLength):
        outFile.write(sequence[i:i+lineLength] + '\n')

def getTransSeqNameForGFY(seqName, frame, addStringToBase=''):
    seqNameList = seqName.split(' ')
    seqNameList[0] = seqNameList[0] + ('_+' if frame > 0 else '_') + str(frame) + addStringToBase
    return ' '.join(seqNameList) + ' frame=%i' %(frame,)
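
# Hedged worked example (not part of the original script): the header for a
# translated frame keeps the original description and records the frame twice,
# once in the identifier and once as a trailing 'frame=' attribute.
def _demo_trans_seq_name():
    assert getTransSeqNameForGFY('chr1 assembled contig', 3) == 'chr1_+3 assembled contig frame=3'
    assert getTransSeqNameForGFY('chr1 assembled contig', -2) == 'chr1_-2 assembled contig frame=-2'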
    

if __name__ == '__main__':
    print 'This program will take a FASTA file from the --lads argument and output a six-frame translation of the file to output. Number refers to maximum size of sequence in resulting FASTA file. If a chromosomal region exceeds this length with no stop codons, the sequence will be chunked with a 100 aa overhang at each edge. Minimum Length of peptide in FASTA file is 5.'
    options = ArgLib.parse(['lads', 'output', 'number'])

    outFile = open(options.output, 'w')
    #chunkSize = int(options.number)
    
    for seqName, sequence in sequenceGenerator(options.lads):
        for frame in [1, 2, 3, -1, -2, -3]:
        #for frame in [1,2,3]:
            #print seqName, frame
            transSeq = getTranslation(sequence.upper(), frame)
            chunkAndWriteSequence(outFile, seqName, transSeq, frame, len(sequence), lineLength=60, chunkSize=2000, dontReadThrough=['X'], minPeptLength=10, overhang=50)
            #transSeqName = seqName + ('_+' if frame > 0 else '_') + str(frame)
            #transSeqName = getTransSeqNameForGFY(seqName, frame)
            #writeSequence(outFile, transSeqName, transSeq)
            
    outFile.close()
        mod_counts[mod]['Unique'].add(context)

    for mod in sorted(mod_counts, key=lambda k: -len(mod_counts[k]['Unique'])):
        outFile.write('\t'.join([
            str(mod),
            str(mod_counts[mod]['Total']),
            str(len(mod_counts[mod]['Unique']))
        ]) + '\n')

    outFile.close()


if __name__ == '__main__':
    print 'The modmaxcounts argument is the number of iterations in the initial EM over all results; the maxcounts argument is the maximum number of iterations before expectation-maximization terminates after reranking is completed (on the top-ranked results only). Set fracs to "all" to run over all fractions in the experiment, or supply the desired fractions to run EM over, separated by commas.'
    options = ArgLib.parse([
        'init', 'dtadir', 'sqlitedb', 'experimentname', 'fracs', 'model',
        'config', 'modtolerance', 'ppmstd', 'output'
    ])

    params_dict = ArgLib.parseInitFile(options.init, options)

    engine = create_engine('sqlite:///' + options.sqlitedb, echo=True)
    conn = engine.connect()
    conn.execute("PRAGMA max_page_count = max_page;")
    conn.execute("PRAGMA temp_store = 2;")
    conn.execute("PRAGMA page_size")

    out_base = os.path.splitext(options.output)[0]

    p_net = PN.ProbNetwork(options.config, options.model)

    try:
Example #21
def getPeptideContext(extended_sequence, start=None, end=None):
    return extended_sequence[
        start -
        1] + '.' + extended_sequence[start:end] + '.' + extended_sequence[end]


def getStartAndEndInds(proteinSeq, peptide):
    startInd = proteinSeq.index(peptide)
    return startInd, startInd + len(peptide)
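
# Hedged worked example (not part of the original script): pad the protein with
# '-' on both ends, as the context helper earlier in this listing does, then
# recover the flanking residues around a peptide hit.
def _demo_peptide_context():
    extended = '-' + 'AAAPEPTIDEKKK' + '-'
    start, end = getStartAndEndInds(extended, 'PEPTIDE')
    # The flanked form is '<preceding residue>.<peptide>.<following residue>'.
    assert getPeptideContext(extended, start, end) == 'A.PEPTIDE.K'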


if __name__ == '__main__':
    start_time = time.time()

    options = ArgLib.parse([
        'init', 'output', 'ppmstd', 'denovo', 'unimoddict', 'modmaxcounts',
        'maxcounts', 'fmindex', 'scans'
    ])

    # Amount of mass to add to ends of peptides when fetching sequence context for incomplete masses, may move to command line arguments later
    end_buffer = 250
    # Minimum AA mass, used to decide how many amino acids to request from sequence index when attempting to resolve an inexact match
    min_aa_mass = 50
    # Length of matched tags to tile across de novo and database sequences
    tag_length = 2
    # Maximum number of matches to return per scan
    max_candidates = 100

    # Set to Filtered mode
    filter_for_scans = False
    if options.scans is not None:
        filter_for_scans = True
            else:
                prev.terminate() 
                curr += 1
        
        p.start()

    for p in processes:
        p.join()

    for l in L:
        for j in l:
            outFile.write(str(j) + '\t')
        outFile.write('\n')

if __name__ == '__main__' :
    options = ArgLib.parse(['init', 'dtadir', 'config', 'model', 'output', 'columns', 'verbose', 'paircutoff', 'ppmsyserror', 'ppmstd', 'ppmpenalty', 'ambigpenalty', 'minedge', 'maxedge', 'alpha', 'subgraphcut', 'symbolmap'])
    epStep = 0.00025
    maxEp = 0.1
    
    paramsDict = ArgLib.parseInitFile(options.init, options)
    with open(options.symbolmap, 'r') as fin:
        symbolMap = pickle.load(fin)
    seqMap = DataFile.generateSeqMap({'LADS Unit Test': 'LADS'}, symbolMap, paramsDict)
    
    if options.columns:
        with open(options.columns) as fin:
            cols = pickle.load(fin)
    else:
        print 'Using default cols'
        cols = ['light scan', 'heavy scan', 'pair configuration', 'M+H', 'score', 'seq', 'epsilon', 'ambiguous edges', 'num ambig edges']
    
Example #23
    return processedInfo

def updateProgDict(fDict, progDict, prog):
    for name in fDict.values():
        progDict[name] = prog

def getAllScanF(processedInfo):
    scanFs = np.array([], dtype=np.dtype('int'))
    for progName in processedInfo.keys():
        scanFs = np.append(scanFs, np.array(processedInfo[progName].keys(), dtype=np.dtype('int')))

    return np.unique(scanFs)


if __name__ == '__main__':
    options = ArgLib.parse(['init', 'lads', 'sequest', 'mascot', 'pepnovo', 'output', 'database', 'symbolmap', 'pnovo', 'peaks', 'combined', 'srchid'])
    
    paramsDict = ArgLib.parseInitFile(options.init, options)
    progDict = ArgLib.getProgDict(An.searchprogs, options)
    
    with open(options.symbolmap, 'r') as fin:
        symbolMap = pickle.load(fin)

    seqMap = DataFile.generateSeqMap(progDict, symbolMap, paramsDict)

    dbDict = DataFile.getDBInfo(options.database)
    processedInfo = {}  
    if options.lads:
        LADSdict = eval(options.lads)
        for tdvfile in LADSdict.keys():
            LADSScanInfo = DataFile.getScanInfo(tdvfile, dbDict['LADS']['fields'], delimiter='\t')