Example #1
def findDeltaPairs(dtaList, delta, ppm=5, intEp=20):
    precMassArr = np.zeros((len(dtaList), 2))
    pairs = []
    
    for i in range(len(dtaList)):
        precMassArr[i] = [DataFile.getPrecMassAndCharge(dtaList[i])[0], DataFile.getScanNum(dtaList[i])]

    maxPrecMass = np.max(precMassArr, 0)[0]
    epsilon = ppm * 10**-6 * maxPrecMass
    resolution = epsilon/intEp
    
    hPrecMassArr = copy.copy(precMassArr)
    hPrecMassArr[:,0] = np.round(hPrecMassArr[:,0]/resolution)
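    # Bin each precursor mass into resolution-sized buckets; registering every mass in its
    # +/- intEp neighbouring buckets lets the shifted lookup below probe a single bucket.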
    hashedDict = {}
    for elem in hPrecMassArr:
        hInd = int(elem[0])
        for hMass in range(hInd-intEp, hInd+intEp+1):
            try:
                hashedDict[hMass] += [(hMass-hInd, elem[1])]
            except KeyError:
                hashedDict[hMass] = [(hMass-hInd, elem[1])]
    
    shiftHashDict = copy.copy(precMassArr)
    shiftHashDict[:,0] = np.round((shiftHashDict[:,0] - delta)/resolution)
    for i, elem in enumerate(shiftHashDict):
        hInd = int(elem[0])
        if hInd in hashedDict:
            for possiblePair in hashedDict[hInd]:
                if abs(possiblePair[0]) * resolution * 10**6/precMassArr[i][0] < ppm:
                    pairs += [(int(possiblePair[1]), int(elem[1]))]

    return pairs
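A minimal usage sketch for findDeltaPairs above, assuming a hypothetical directory of .dta files and a placeholder mass delta; numpy (np), copy and DataFile are the modules the function itself relies on.

import copy
import glob

import numpy as np

import DataFile  # assumed module providing getPrecMassAndCharge and getScanNum

dta_list = glob.glob('spectra/*.dta')   # hypothetical directory of .dta files
delta = 8.0142                          # placeholder heavy/light mass difference in Da
pairs = findDeltaPairs(dta_list, delta, ppm=5)
print('%d candidate scan pairs differ by ~%.4f Da' % (len(pairs), delta))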
Example #2
def main():
    pic_dir, nginx_dir = gen_pic_dir(pic_dir_prefix)
    get_word(get_word_loc, word_file)

    index = 1
    report_content = ""
    mail_title = Template.html_h3_title("如下查询结果可能有乱码,请确认")
    mail_title += Template.html_h3_title("本次运行的截图目录为:%s" % nginx_dir)
    mail_res = ""

    for word in word_list:
        print("process %d word" % index)
        try:
            # ready screenshot
            tmp_list = word.split()
            query = tmp_list[0]
            vrid = tmp_list[1]
            _craw_url = url_prefix + quote(query)
            vrstr = 'div.vrResult[id*="sogou_vr_' + vrid + '"],div.vrResult[id*="sogou_vr_kmap_' + vrid + '"]'
            vrstr = quote(vrstr)
            picname = pic_dir + "/" + "_".join(
                [str(index), quote(query), vrid + ".png"])
            nodejs_script = 'spec-selector.js'
            path = '/search/odin/yinjingjing/python/garbled_detector/'


            child = subprocess.Popen(['/bin/node', nodejs_script, \
                                      '-t', 'android', '-m', 'css', \
                                      '-k', vrstr, '-n', picname, \
                                      '-u', quote(_craw_url)], shell=False, \
                                      cwd = path, stdout=subprocess.PIPE)
            nodejs_res = child.stdout.read()
            if nodejs_res != b'0\n':
                utf8stdout("pupppeter ERROR. query:%s, vrid:%s, error:%s" %
                           (query, vrid, nodejs_res))
                continue
            else:
                garble, res = check_garbled(query, picname)
                utf8stdout(
                    "query:%s, vrid:%s, gDetect-api result:%s, is_garble:%s" %
                    (query, vrid, res, garble))
                if garble:
                    mail_info = "index:%d, query:%s, vrid:%s, gDetect-api result:%s" % (
                        index, query, vrid, res)
                    mail_res += "<p>" + mail_info + "</p>\n"

            child.wait()

            index = index + 1

        except Exception as err:
            print(err)
            index = index + 1
            continue
    # Only send mail when garbled results are detected
    #if mail_res:
    # utf8stdout("mail_res is not null, Send mail")
    report_content = mail_title + mail_res
    DataFile.write_full_file(report_tmp_path, report_content)
    Mail.sendMail("立知&图谱结果乱码检测", report_tmp_path, mail_to)
Example #3
def getPairedAndUnpairedSpectra(dtaDir, dtaList, Nmod, Cmod, ppm=5, cutOff=0.1, verbose=False):
    specPairs = []
    unpairedSpecs = []
    delta = Nmod + Cmod
    for i in range(len(dtaList)):
        paired = False
        precMass1 = DataFile.getPrecMassAndCharge(dtaList[i])[0]
        spec1 = DataFile.getMassIntPairs(dtaList[i])
        for j in range(i + 1, len(dtaList)):
            precMass2 = DataFile.getPrecMassAndCharge(dtaList[j])[0]
            epsilon = ppm * 10 ** -6 * max(precMass1, precMass2)
            if np.abs(np.abs(precMass1 - precMass2) - delta) < epsilon:
                spec2 = DataFile.getMassIntPairs(dtaList[j])
                if precMass1 < precMass2:
                    N, C = SA.getNandCIons(spec1, spec2, Nmod, Cmod, epsilon=epsilon)
                    ratio = SA.getSharedPeaksRatio(spec1, spec2, N, C)
                else:
                    N, C = SA.getNandCIons(spec2, spec1, Nmod, Cmod, epsilon=epsilon)
                    ratio = SA.getSharedPeaksRatio(spec2, spec1, N, C)
                if ratio > cutOff:
                    if verbose:
                        print 'Pair found', dtaList[i], dtaList[j]
                    paired = True
                    specs = (dtaList[i], dtaList[j])
                    lightInd = int(precMass2 < precMass1)
                    specPairs.extend([(ratio, specs[lightInd], specs[1 - lightInd])])
        
        if not paired:
            unpairedSpecs.extend([dtaList[i]])
            if verbose:
                print 'No pairs for', dtaList[i]
    return specPairs, unpairedSpecs
Example #4
def getAlignmentRatios(scanInfoFName, dtaDir, delta, epsilon=0.02):
    scanInfo = DataFile.getScanInfo(scanInfoFName)
    dtaNames = DataFile.getDTAFNamesInDir(dtaDir)
    
    scansToUse = scanInfo
    """
    for i in range(len(scanInfo) - 1):
        if (int(scanInfo[i][0]) + 1 == int(scanInfo[i+1][0])):
            if (scanInfo[i][1] == scanInfo[i+1][1]):
                scansToUse += [scanInfo[i]]
        else:
            scansToUse += [scanInfo[i]]
    """
    ratios = []
    goodRatios = []
    for i in range(len(scansToUse)):
        for j in range(i + 1, len(scansToUse)):
            if j == i + 1:
                print '%s percent done' % str(float(i) / len(scansToUse))
            if np.abs(np.abs(float(scansToUse[i][1]) - float(scansToUse[j][1])) - delta) < epsilon:
                dta1 = '244.%(scanF)04i.%(scanF)04i.1.dta' % {'scanF': int(scansToUse[i][0])}
                dta2 = '244.%(scanF)04i.%(scanF)04i.1.dta' % {'scanF': int(scansToUse[j][0])}
                spec1 = DataFile.getMassIntPairs(dtaDir + dta1)
                spec2 = DataFile.getMassIntPairs(dtaDir + dta2)
                ratio = SA.getSharedPeaksRatio(float(scansToUse[i][1]), spec1, float(scansToUse[j][1]), spec2, epsilon)
                print ratio, scansToUse[i], scansToUse[j]
                ratios.extend([(ratio, scansToUse[i], scansToUse[j])])

    with open('heavylightpairs.txt', 'w') as fout:
        pickle.dump(ratios, fout)
    return ratios
Example #5
def findSamePrecMassClusters(dtaList, ppm=5):
    precMassArr = np.zeros((len(dtaList), 2))
    for i in range(len(dtaList)):
        precMassArr[i] = [DataFile.getPrecMassAndCharge(dtaList[i])[0], DataFile.getScanNum(dtaList[i])]
        
    precMassArr = precMassArr[np.argsort(precMassArr[:,0])]
    
    clusters = [[i] for i in range(precMassArr.shape[0])]
    
    # Start at 1 so each cluster is only compared with its predecessor in sorted order
    i = 1
    while i < len(clusters):
        mergeClusters = False
        epsilon = ppm * 10**-6 * precMassArr[clusters[i][0]][0]
        for precMassInd1 in clusters[i]:
            for precMassInd2 in clusters[i - 1]:
                if (np.abs(precMassArr[precMassInd1][0] - precMassArr[precMassInd2][0]) < epsilon):
                    mergeClusters = True
                    break
            
        if mergeClusters:
            clusters[i - 1].extend(clusters[i])
            del clusters[i]
        else:
            i = i + 1
    
    scanFClusters = []
    for cluster in clusters:
        scanFClusters += [[precMassArr[i][1] for i in cluster]]
       
    return scanFClusters
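A hedged usage sketch: group scans whose precursor masses agree within the ppm tolerance (the directory path is a placeholder).

import glob

dta_list = glob.glob('spectra/*.dta')   # hypothetical .dta directory
clusters = findSamePrecMassClusters(dta_list, ppm=5)
for scan_cluster in clusters:
    if len(scan_cluster) > 1:
        print('Scans sharing a precursor mass: %s' % scan_cluster)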
Example #6
async def _action_combo_get_page_content(url, cookies_dir='data/cookies/'):
    try:
        # Parse which domain the URL belongs to
        parsed_uri = urlparse(url)
        cookies_file = "".join([cookies_dir, parsed_uri.netloc, "cookie"])
        my_cookie_file = DataFile.read_file_intostr(cookies_file)
        browser = await launch({
            "executablePath": "chromium-browser",
            "args": ["--no-sandbox"]
        })
        page = await browser.newPage()
        # Load saved cookies, if any
        if (len(my_cookie_file) > 0):
            my_cookie_object = json.loads(my_cookie_file)
            print("".join(
                ["Load ",
                 str(len(my_cookie_object)), " cookie item(s)."]))
            for row in my_cookie_object:
                await page.setCookie(row)
        # Set the User-Agent
        ua_box = UserAgent.UserAgentBox()
        await page.setUserAgent(ua_box.wap_normal_user)
        await page.goto(url)
        new_cookie = await page.cookies()
        json_cookie = json.dumps(new_cookie)
        res = await action_get_page_content(page)
        DataFile.write_full_file(cookies_file, json_cookie)
        await browser.close()
        return res
    except Exception as e:
        traceback.print_exc()
        return ""
Example #7
def main():
    arr_lst = ['name', 'allname', 'year']
    get_word(get_word_loc, word_file)

    index = 1
    report_content = ""
    mail_title = Template.html_h3_title("如下查询结果可能有乱码,请确认")
    mail_res = ""

    for word in word_list:
        utf8stdout("process %d word" % index)
        try:
            # ready get qa_text
            tmp_list = word.split()
            query = tmp_list[0]
            vrid = tmp_list[1]
            response = get_response(query, 'wap', '1')

            if not response:
                utf8stdout("source response is null. query:%s, vrid:%s" %
                           (query, vrid))
                index = index + 1
                continue

            extract_text = get_att_name(query, response, arr_lst)
            utf8stdout("extract_text:%s" % extract_text)
            if not extract_text:
                utf8stdout("extract text is null. query:%s, vrid:%s" %
                           (query, vrid))
                index = index + 1
                continue

            garble, res = check_garbled(query, extract_text)
            utf8stdout(
                "query:%s, vrid:%s, gDetect-api result:%s, is_garble:%s" %
                (query, vrid, res, garble))
            if garble:
                f_res.write("index:%d, query:%s, vrid:%s\n" %
                            (index, query, vrid))
                f_res.write("gDetect-api result:%s\n" % res)
                f_res.write("extract_text:%s\n" % extract_text)
                f_res.write('\n')

            index = index + 1

        except Exception as err:
            utf8stdout(err)
            index = index + 1
            continue

    f_res.close()
    # Only send mail when garbled results are detected
    if os.path.getsize(result_file) > 0:
        report_content = mail_title + mail_res
        DataFile.write_full_file(report_tmp_path, report_content)
        Mail.sendMail("图谱推荐结果乱码检测",
                      report_tmp_path,
                      mail_to,
                      attachment=result_file)
def getScanFDict(dtaList):
    scanFDict = {}
    for dta in dtaList:
        scanF = DataFile.getScanNum(dta)
        precMass = DataFile.getPrecMassAndCharge(dta)[0]
        scanFDict[scanF] = {"dta": dta, "precMass": precMass, "sequenced": False}

    return scanFDict
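A short usage sketch for getScanFDict; the directory is hypothetical, and the returned dict maps each scan number to its .dta path, precursor mass and a 'sequenced' flag.

import glob

dta_list = glob.glob('dta_dir/*.dta')   # hypothetical .dta directory
scanFDict = getScanFDict(dta_list)
for scanF, info in scanFDict.items():
    print('scan %s: precursor M+H %.4f (%s)' % (scanF, info['precMass'], info['dta']))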
Example #10
def main():
    get_word(get_word_loc, word_file)

    index = 1
    report_content = ""
    mail_title = Template.html_h3_title("附件结果可能有乱码,请确认")

    for word in word_list:
        utf8stdout("process %d word" % index)
        try:
            # ready get qa_text
            tmp_list = word.split()
            query = tmp_list[0]
            vrid = tmp_list[1]

            node = Node(query, vrid)
            node.gen_url()
            node.get_response()
            if not node.html:
                utf8stdout("source html is null. query:%s, vrid:%s" %
                           (query, vrid))
                index = index + 1
                continue

            node.get_qa_text()
            if not node.qa_text:
                utf8stdout("qa text is null. query:%s, vrid:%s" %
                           (query, vrid))
                index = index + 1
                continue

            node.check_garbled()
            node_res = node.output_garble()

            if node_res:
                f_res.write("index:%d, query:%s, vrid:%s\n" %
                            (index, query, vrid))
                f_res.write("gDetect-api result:%s\n" % node.garble_res)
                f_res.write("qa_text:%s\n" % node.qa_text)
                f_res.write('\n')

            index = index + 1

        except Exception as err:
            utf8stdout(err)
            index = index + 1
            continue

    f_res.close()

    # Only send mail when garbled text is detected
    if os.path.getsize(result_file) > 0:
        report_content = mail_title
        DataFile.write_full_file(report_tmp_path, report_content)
        Mail.sendMail("立知问答结果乱码检测",
                      report_tmp_path,
                      mail_to,
                      attachment=result_file)
Example #11
def main():
    get_word(get_word_loc, word_file)

    index = 1
    report_content = ""
    mail_title = Template.html_h3_title("附件结果可能有乱码,请确认")

    for word in word_list:
        utf8stdout("process %d word" % index)
        try:
            # ready get qa_text
            tmp_list = word.split()
            query = tmp_list[0]
            vrid = tmp_list[1]
            html = get_response(query)
            if not html:
                utf8stdout("source html is null. query:%s, vrid:%s" %
                           (query, vrid))
                index = index + 1
                continue

            qa_text = get_qa_text(query, html)
            if not qa_text:
                utf8stdout("qa text is null. query:%s, vrid:%s" %
                           (query, vrid))
                index = index + 1
                continue

            garble, res = check_garbled(query, qa_text)
            utf8stdout("query:%s, vrid:%s, is_garble:%s" %
                       (query, vrid, garble))
            utf8stdout("gDetect-api result:%s" % res)
            utf8stdout("qa_text:%s" % qa_text)
            if garble:
                f_res.write("index:%d, query:%s, vrid:%s\n" %
                            (index, query, vrid))
                f_res.write("gDetect-api result:%s\n" % res)
                f_res.write("qa_text:%s\n" % qa_text)
                f_res.write('\n')

            index = index + 1

        except Exception as err:
            utf8stdout(err)
            index = index + 1
            continue
    # Only send mail when garbled results are detected
    #if mail_res:
    # utf8stdout("mail_res is not null, Send mail")
    f_res.close()
    report_content = mail_title
    DataFile.write_full_file(report_tmp_path, report_content)
    Mail.sendMail("立知问答结果乱码检测",
                  report_tmp_path,
                  mail_to,
                  attachment=result_file)
def send_mail(task_id):
    try:
        report_content = ""
        url = "http://fs.sogou/lizhi_accu_compare/mission_list/" + str(
            task_id) + "/"
        mail_title = Template.html_h3_title("立知结果精度对比运行完毕,请对结果进行标注:")
        mail_content = Template.html_p(url)

        report_content = mail_title + mail_content
        DataFile.write_full_file(report_tmp_path, report_content)
        Mail.sendMail("立知结果精度对比运行完毕,请对结果进行标注", report_tmp_path, mail_to)
    except Exception as err:
        print("[send_mail]:%s" % err)
def getPairs(pairs, xVals):
    for pair in pairs:
        lightSpecs = [DataFile.getMassIntPairs(scanFDict[lightScanF]['dta']) for lightScanF in samePeptideClusters[pair[0]]]
        heavySpecs = [DataFile.getMassIntPairs(scanFDict[heavyScanF]['dta']) for heavyScanF in samePeptideClusters[pair[1]]]
        lightPrecMass = np.average(np.array([scanFDict[lightScanF]['precMass'] for lightScanF in samePeptideClusters[pair[0]]]))

        epSTD = (float(paramsDict['ppmstd']['value'])) * 10 ** -6 * lightPrecMass

        lightMergedSpec = SA.mergeSpectra(lightSpecs, epsilon=2*epSTD)
        heavyMergedSpec = SA.mergeSpectra(heavySpecs, epsilon=2*epSTD)

        svmClassificationData = SA.getSpectraPairInfoForSVMClassification(lightMergedSpec, heavyMergedSpec, lightPrecMass, NMod=pairConfig['NMod'], CMod=pairConfig['CMod'], epsilon=2*epSTD)
        xVals.put([svmClassificationData])
    
    return xVals
Example #14
def parseDBScans(fDict, prog, seqMap, dbDict):
    processedInfo = {}
    for csvfile in fDict.keys():
        MASCOTData = DataFile.getScanInfo(csvfile, dbDict[prog]['fields'], delimiter=',')
        processedInfo[fDict[csvfile]] = An.preprocessDatabaseScanInfo(MASCOTData, seqMap[fDict[csvfile]], dbDict[prog]['fieldmap'])
    
    return processedInfo
Example #15
def getSamePeptideClusters(precMassClusters, scanFDict, svmModel, svmRange, ppmSTD=5, cutOff=0):
    trueClusters = []
    for cluster in precMassClusters:
        if len(cluster) == 1:
            trueClusters += [cluster]
        else:
#            print 'testing cluster', cluster
            pairIndex = []
            xVals = []
            specs = []
            for i in range(len(cluster)):
                specs +=  [DataFile.getMassIntPairs(scanFDict[cluster[i]]['dta'])]
                
            dMatrix = np.ones((len(cluster), len(cluster))) * -2
            for i in range(len(cluster)):
                for j in range(i+1, len(cluster)):
                    epSTD = ppmSTD * 10 ** -6 * scanFDict[cluster[i]]['precMass']
            
                    SVMClassificationInfo = SA.getSpectraPairInfoForSVMClassification(specs[i], specs[j], scanFDict[cluster[i]]['precMass'], NMod=0, CMod=0, epsilon=2*epSTD)
                    xVals += [SVMClassificationInfo]
                    pairIndex += [(i, j)]
            
            xValsNorm = svmutil.normalize_instances(xVals, svmRange)
            pLabs = svmutil.svm_predict([0]*len(xValsNorm), xValsNorm, svmModel)[0]
#            print pLabs
            for i, pLab in enumerate(pLabs):
            # Scale distances by 4: totalTICRatio, 1: TotalSharedPeaksRatio
                dMatrix[pairIndex[i][0]][pairIndex[i][1]] =  dMatrix[pairIndex[i][1]][pairIndex[i][0]] = xVals[i][1] if pLab==1 else -1

            trueClusters += heirarchicalClusteringAverageLinkage([[scanF] for scanF in cluster], dMatrix, cutOff=cutOff)
    
    return trueClusters
Example #16
def parseScans(fDict, prog, seqMap, dbDict, delimiter=',', srchID = None, seqDelimLen=2):
    processedInfo = {}
    for csvfile in fDict.keys():
        MASCOTData = DataFile.getScanInfo(csvfile, dbDict[prog]['fields'] + (['SrchID'] if srchID != None else []), delimiter=delimiter)
        processedInfo[fDict[csvfile]] = An.preprocessDatabaseScanInfo(MASCOTData, seqMap[fDict[csvfile]], dbDict[prog]['fieldmap'], srchID = srchID, seqDelimLen=seqDelimLen)
    
    return processedInfo
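A hedged usage sketch for parseScans; the CSV filenames and labels are placeholders, while seqMap and dbDict would come from DataFile.generateSeqMap and DataFile.getDBInfo as in the __main__ snippets further below.

# Hypothetical mapping from MASCOT CSV exports to run labels
fDict = {'run1_mascot.csv': 'Run1', 'run2_mascot.csv': 'Run2'}
processedInfo = parseScans(fDict, 'MASCOT', seqMap, dbDict, delimiter=',')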
Example #17
def analiseMedia(acao, nome):
        opcao = ''
        cor = ''
        dG = DataFile(nome, acao)
        sma7 = dG.getData(7)
        sma21 = dG.getData(21)
        label, sma7 = exportLastDataPlot(sma7,'SMA')
        label, sma21 = exportLastDataPlot(sma21, 'SMA')
        if sma7[-1] < sma21[-1]:
                opcao = 'Comprar'
                cor = 'green'
        elif sma7[-1] > sma21[-1]:
                opcao = 'Vender'
                cor = 'red'
        else:
                opcao = 'Incerteza, não faça nada'
                cor = 'grey'
        return opcao, cor, sma21[-1]
Example #18
def importTAGGRAPHResults(connection, experiment_name, fraction_name, taggraph_files, max_batch_size = 500000):

    try:
        experiment_id = connection.execute(select([experiment.c.id]).where(experiment.c.name == experiment_name)).fetchone()[0]
    except TypeError:
        raise ValueError("ERROR: No experiment by name %s"%experiment_name)

    try:
        fraction_id = connection.execute(select([fraction.c.id]).where(and_(fraction.c.name == fraction_name, fraction.c.experiment_id == experiment_id))).fetchone()[0]
        print "Using existing fraction %s in database for experiment"%str((fraction_id, fraction_name))
    except TypeError:
        print 'FRAC NOT FOUND, CREATING NEW FRAC', fraction_name, experiment_id
        res = connection.execute(fraction.insert().values(name=fraction_name, experiment_id=experiment_id))
        fraction_id = res.inserted_primary_key[0]

    connection.execute(fraction.update().where(fraction.c.id == fraction_id).values(taggraph_file=str(taggraph_files)))

    values = []
    for taggraph_file in taggraph_files:
        taggraph_info = DataFile.getScanInfoIterator(taggraph_file, delimiter='\t')
        for item in taggraph_info:
            values += [{
                "scan": item['ScanF'],
                "charge": item['Charge'],
                "obs_mh": item['Obs M+H'],
                "theo_mh": item['Theo M+H'],
                "ppm": item['PPM'],
                "retention_time": item['RT'],
                "alignment_score": item['Alignment Score'],
                "spectrum_score": item['Spectrum Probability Score'],
                "composite_score": item['Composite Score'],
                "context": item['Context'],
                "mod_context": item['Mod Context'],
                "mods": item['Match Modifications'],
                "mod_ranges": item['Mod Ranges'],
                "mod_ambig_edges": item['Mod Ambig Edges'],
                "proteins": item['Proteins'],
                "matching_tag_length": item['Matching Tag Length'],
                "time_taken": item['Time Taken'],
                "de_novo_peptide": item['De Novo Peptide'],
                "unmod_de_novo_peptide": item['Unmod De Novo Peptide'],
                "de_novo_score": item['De Novo Score'],
                "num_matches": item['Num Matches'],
                "fraction_id": fraction_id
                
                }]

            if len(values) > max_batch_size:
                res = connection.execute(result.insert(), values)
                values = []
        
        #fraction.results.extend([Result(scan=item['ScanF'], alignment_score=item['Alignment Score'], spectrum_score=item['Spectrum Probability Score'], composite_score=item['Composite Score'], context=item['Context'], mod_context=item['Mod Context'], mods=item['Match Modifications'], mod_ambig_edges=item['Mod Ambig Edges'], proteins=item['Proteins'], matching_tag_length=item['Matching Tag Length'], time_taken=item['Time Taken'], de_novo_peptide=item['De Novo Peptide'], unmod_de_novo_peptide=item['Unmod De Novo Peptide'], de_novo_score=item['De Novo Score'], num_matches=item['Num Matches'])])
    if len(values) > 0:
        res = connection.execute(result.insert(), values)
    
    return True
Example #19
def parseScans(fDict,
               prog,
               seqMap,
               dbDict,
               delimiter=',',
               srchID=None,
               seqDelimLen=2):
    processedInfo = {}
    for csvfile in fDict.keys():
        cols, data = DataFile.getScanInfo(
            csvfile,
            dbDict[prog]['fields'] + (['SrchID'] if srchID != None else []),
            delimiter=delimiter)
        processedInfo[fDict[csvfile]] = DataFile.preprocessDatabaseScanInfo(
            data,
            seqMap[fDict[csvfile]],
            dbDict[prog]['fieldmap'],
            seqDelimLen=seqDelimLen)

    return processedInfo
Example #20
def parseInitFile(init, options):
    A=setupParams()
    paramsDict = DataFile.parseParams(init)
    for param in paramsDict['Parameters'].keys():
        try:
            paramType = A[param]['attrs']['type']
            val = paramsDict['Parameters'][param]
            if paramType != 'string':
                val = getattr('__builtin__', paramType)(val)
            setattr(options, param, val)
        except KeyError:
            pass
    return paramsDict
Example #21
def get_taxons_at_score_percent_cutoff(get_taxons_file,
                                       score_percent_cutoff=0.001):
    taxons = []
    all_pepts = set()
    cols, data = DataFile.getScanInfo(get_taxons_file, delimiter='\t')

    for item in data:
        all_pepts |= eval(item['Peptide Cover'])

    for item in data:
        if float(item['Score']) / len(all_pepts) >= score_percent_cutoff:
            taxons += [item['Taxon']]

    return taxons
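A minimal usage sketch; the tab-delimited file name is a placeholder, and its 'Taxon', 'Score' and 'Peptide Cover' columns are the ones the function reads.

taxons = get_taxons_at_score_percent_cutoff('taxon_scores.tsv', score_percent_cutoff=0.001)
print('%d taxons pass the cutoff' % len(taxons))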
Example #22
def parseInitFile(init, options):
    paramsDict = DataFile.parseParams(init)
    for param in paramsDict['LADS Parameters'].keys():
        try:
            paramType = A[param]['attrs']['type']
            val = paramsDict['LADS Parameters'][param]
            if paramType != 'string':
                val = getattr('__builtin__', paramType)(val)
            
            setattr(options, param, val)
        except KeyError:
            pass
    
    return paramsDict
Example #23
def getLADSPScore(seq, dtaPath, PNet, ppm=5, ambigEdges=None, ambigAA='X', ambigPenalty=20):
    pairs = DataFile.getMassIntPairs(dtaPath)
    precMass = DataFile.getPrecMassAndCharge(dtaPath)[0]
    epsilon = ppm * precMass * 10 ** -6
    spec = PN.Spectrum(PNet, precMass, Nmod=0, Cmod=0, epsilon=epsilon, spectrum=pairs)
    spec.initializeNoiseModel()
    nodeGen = Constants.nodeInfoGen(seq, considerTerminalMods=True, ambigEdges=ambigEdges)
    pScore = 0
    node = nodeGen.next()
    pScore += spec.getNodeScore(**node)
    pScore += spec.getPriorScore(prm=0, formAA=None, lattAA=node['formAA'])
    if node['formAA'] == ambigAA:
        pScore -= ambigPenalty
        
    for node in nodeGen:
        pScore += spec.getNodeScore(**node)
        if node['formAA'] == ambigAA:
            pScore -= ambigPenalty
            
    pScore += spec.getPriorScore(prm=precMass- Constants.mods['H+'] - Constants.mods['H2O'], formAA=node['lattAA'], lattAA=None)
    if node['lattAA'] == ambigAA:
        pScore -= ambigPenalty
    
    return pScore  
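A hedged usage sketch for getLADSPScore; the config/model files, .dta path and peptide sequence are placeholders, and PN.ProbNetwork is constructed the same way as in the __main__ snippets below.

PNet = PN.ProbNetwork('config.txt', 'model.txt')   # placeholder config and model files
score = getLADSPScore('SAMPLER', 'spectra/244.0123.0123.1.dta', PNet, ppm=5)
print('LADS PScore: %f' % score)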
Example #24
def CreateBPlusTree(filename, Maxkeys):
    '''
    Objective        : To create a B+ Tree of records.
    Input Parameters :
            filename -> Name of the file whose pickled records are used to create the B+ Tree.
             Maxkeys -> Maximum number of keys, passed to the BplusTree constructor.
    Output           : b -> The BplusTree object built from the records.
    '''
    file = open(filename, 'rb')
    b = BplusTree(Maxkeys)
    i = 0
    while True:
        try:
            record = pickle.load(file)
            key = DataFile.RecordKey(record)
            b.Insert((key, i))
            i += 1
        except EOFError:
            break
    file.close()
    return b
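A minimal usage sketch under the docstring's assumptions; the record file and Maxkeys value are placeholders.

tree = CreateBPlusTree('records.dat', 4)   # hypothetical pickled record file, Maxkeys=4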
Example #25
            compInfo[int(scan['ScanF'])] = scanInfo
    
    return compInfo

def scatterPlot(compInfo, axis1, axis2):
    axis1Vals = []
    axis2Vals = []
    for scanF in compInfo.keys():
        if compInfo[scanF][axis1] != 'None' and compInfo[scanF][axis2] != 'None':
            axis1Vals.extend([float(compInfo[scanF][axis1])])
            axis2Vals.extend([float(compInfo[scanF][axis2])])
    
    plt.scatter(axis1Vals, axis2Vals)
    plt.xlabel(axis1)
    plt.ylabel(axis2)
    plt.show()
    
if __name__ == '__main__':
    scansfName = 'compareSearches_MASCOT_LADSUPen10KPen15All_SEQUEST_ath001862.tdv'
    scansInfo = DataFile.getScanInfo(scansfName, delimiter='\t')
    
    infoMap = {'MASCOT': {'Score': 'Ion Score', 'Peptide': 'Peptide', 'Reference': 'Reference'},
               'SEQUEST': {'Score': 'XCorr', 'Peptide': 'Peptide', 'Reference': 'Reference'},
               'LADS': {'Score': 'PScore', 'Peptide': 'Peptide', 'Reference': 'Reference'}}
    
    compInfo = getScanComparisonInfo(scansInfo, infoMap)
    scatterPlot(compInfo, 'SEQUEST XCorr', 'LADS PScore')
    
    
    
Example #26
def getCompStats(compSearchPath, mainProgName, progDict, infoMap, paramsDict, mainProgFields=['PScore', 'Num Ambig Edges'], getPairStats=True):
    compSearchInfo = DataFile.getScanInfo(compSearchPath, delimiter='\t')
    unpaired = {}
    other = {}
    stats = {}

    for progName, prog in progDict.items():
        if progName == mainProgName:
            continue
        
        unpaired[progName] = {'accuracyVec': np.array([]), 'precisionVec': np.array([]), 'numScans': 0}
        accName, precName = getAccuracyAndPrecisionNames(progName, mainProgName, compSearchInfo[0])
        stats[progName] = {}
        stats[progName]['accName'] = accName
        stats[progName]['precName'] = precName
        for progfield in mainProgFields:
            unpaired[progName][progfield] = np.array([])
        other[progName] = copy.deepcopy(unpaired[progName])

    pairsDict = {}
    if getPairStats:
        truePairs = {}
        falsePairs = {}
        compInfo = getScanComparisonInfo(compSearchInfo, infoMap, progDict, scanFields=['Score', 'Peptide', 'Obs M+H'], specificColDict={'LADS': ['Num Ambig Edges', 'Paired Spectrum', 'Pair Configuration']})
        for pairConfigName in paramsDict['Pair Configurations']:
            truePairs[pairConfigName] = {}
            falsePairs[pairConfigName] = {}
            pairsDict[pairConfigName] = {}
            for progName in progDict.keys():
                if progName == mainProgName:
                    continue
                pairsDict[pairConfigName][progName] = findPairsInSearchResults(compInfo, infoMap, progDict, paramsDict['Pair Configurations'][pairConfigName], progName=progName)
                truePairs[pairConfigName][progName] = copy.deepcopy(unpaired[progName])
                falsePairs[pairConfigName][progName] = copy.deepcopy(unpaired[progName])

    print 'Compiling stats'
    for scan in compSearchInfo:
        scanF1 = int(scan['ScanF'])
        pairType = determinePairType(pairsDict, scan, progDict, infoMap, mainProgName)
        if pairType == None:
            temp = unpaired
        elif pairType:
            temp = truePairs[scan[mainProgName + ' Pair Configuration'].lower()]
        else:
            temp = falsePairs[scan[mainProgName + ' Pair Configuration'].lower()]
        for progName in stats.keys():
            try:
                if scan[progName + ' ' + infoMap[progDict[progName]]['Score']] != 'None':
                    temp[progName]['numScans'] += 1
                temp[progName]['accuracyVec'] = np.append(temp[progName]['accuracyVec'], float(scan[stats[progName]['accName']]))
                temp[progName]['precisionVec'] = np.append(temp[progName]['precisionVec'], float(scan[stats[progName]['precName']]))
                for progfield in mainProgFields:
                    temp[progName][progfield] = np.append(temp[progName][progfield], float(scan[mainProgName + ' ' + progfield]))
            except ValueError:
                other[progName]['numScans'] += 1
                for progfield in mainProgFields:
                    try:
                        other[progName][progfield] = np.append(other[progName][progfield], float(scan[mainProgName + ' ' + progfield]))
                    except ValueError:
                        print 'ERROR in getting main %s data for scan %s, peptide %s, %s %s' % (mainProgName, scan['ScanF'], scan[mainProgName + ' ' + infoMap[progDict[mainProgName]]['Peptide']], progfield, scan[mainProgName + ' ' + progfield])
                        pass
    
    for progName in stats.keys():
        if getPairStats:
            stats[progName]['truepairs'] = {}
            stats[progName]['falsepairs'] = {}
            stats[progName]['pairsDict'] = {}
            stats[progName]['unpaired'] = unpaired[progName]
            stats[progName]['other'] = other[progName]
            stats[progName]['composite'] = {}
            for pairConfigName in truePairs:
                stats[progName]['truepairs'][pairConfigName] = truePairs[pairConfigName][progName]
                stats[progName]['falsepairs'][pairConfigName] = falsePairs[pairConfigName][progName]
                stats[progName]['pairsDict'][pairConfigName] = pairsDict[pairConfigName][progName]
                
            for field in stats[progName]['unpaired']:
                try:
                    truePairsComp = np.concatenate([stats[progName]['truepairs'][pairConfigName][field] for pairConfigName in stats[progName]['truepairs']])
                    falsePairsComp = np.concatenate([stats[progName]['falsepairs'][pairConfigName][field] for pairConfigName in stats[progName]['falsepairs']])
                    stats[progName]['composite'][field] = np.concatenate((truePairsComp, falsePairsComp, stats[progName]['unpaired'][field]))
                except ValueError:
                    pass
            
            numTruePairs = np.sum([stats[progName]['truepairs'][pairConfigName]['numScans'] for pairConfigName in stats[progName]['truepairs']])
            numFalsePairs = np.sum([stats[progName]['falsepairs'][pairConfigName]['numScans'] for pairConfigName in stats[progName]['falsepairs']])
            stats[progName]['composite']['numScans'] = numTruePairs + numFalsePairs + stats[progName]['unpaired']['numScans']
        else:
            stats[progName]['other'] = other[progName]
            stats[progName]['composite'] = unpaired[progName]

    return stats
def getSequencing(pair, sharedPeaks, paramsDict, outFile, res):
    global print_lock, spectrum_lock

    result = []

    scanData = {}
    lightSpecs = [DataFile.getMassIntPairs(scanFDict[lightScanF]['dta']) for lightScanF in samePeptideClusters[pair[0]]]
    heavySpecs = [DataFile.getMassIntPairs(scanFDict[heavyScanF]['dta']) for heavyScanF in samePeptideClusters[pair[1]]]
    precMass = np.average(np.array([scanFDict[lightScanF]['precMass'] for lightScanF in samePeptideClusters[pair[0]]]))
    
    epMean = options.ppmsyserror * precMass * 10**-6
    epSTD = options.ppmstd * precMass * 10**-6
                
    scanData['shared peaks ratio'] = sharedPeaks

    s1 = time.time()
    sharedInfo, starts, ends, deltas, G = DNS.prepPairedSpectrumGraph(lightSpecs, heavySpecs, precMass, addEnds, ppmSTD=options.ppmstd, Nmod=pairConfig['NMod'], Cmod=pairConfig['CMod'], verbose=options.verbose)
    scanData['M+H'] = precMass
    
    specs = []
    for massIntPairs in lightSpecs:
        specs += [PN.Spectrum(PNet, precMass, Nmod=0.0, Cmod=0.0, epsilon=2*epSTD, spectrum=massIntPairs)]
    for massIntPairs in heavySpecs:
        specs += [PN.Spectrum(PNet, precMass + pairConfig['NMod'] + pairConfig['CMod'], Nmod=pairConfig['NMod'], Cmod=pairConfig['CMod'], epsilon=2*epSTD, spectrum=massIntPairs)]
    for spec in specs:
        spec.initializeNoiseModel()

    # with spectrum_lock:
    temp = DNS.getSpectrumGraphDataThread(G, deltas, specs, starts, ends, precMass - Constants.mods['H+'] - Constants.mods['H2O'], ambigPenaltyFun, ppmPenaltyFun, hashedAAs, termModHash=termModHash, maxEdge=options.maxedge, minEdge=options.minedge, subGraphCut=options.subgraphcut, subAlpha=0.3, alpha=options.alpha, epMean=epMean, epSTD=epSTD, epStep=epStep, verbose=options.verbose)
    temp_scan = temp[0]
    peps = temp[1]
    scanData.update(temp_scan)
    
    scanData['pair configuration'] = pairConfigName

    with print_lock:
        print 'Now sequencing light scan(s) %s, heavy scan(s) %s with shared peaks ratio %f \n' % (str(samePeptideClusters[pair[0]]), str(samePeptideClusters[pair[1]]), scanData['shared peaks ratio'])
        # out.append('Now sequencing light scan(s) ' + str(samePeptideClusters[pair[0]]) + ', heavy scan(s) ' + str(samePeptideClusters[pair[1]]) + ' with shared peaks ratio ' + str(scanData['shared peaks ratio']) + ' \n' )
        Ord = np.argsort(-1 * np.array(scanData['over_scores']))
        if scanData['blind'] == 0:
            for i in range(min(Ord.size, 10)):
                try:
                    print 'Score: ', peps[0][Ord[i]], 'Seq: ', ''.join(peps[1][Ord[i]])
                    # out.append('Score: ' + str(peps[0][Ord[i]]) + ' Seq: ' + ''.join(peps[1][Ord[i]]))
                except TypeError:
                    print 'Score: ', peps[0][Ord[i]], 'Seq: ', peps[1][Ord[i]]
                    # out.append('Score: ' + str(peps[0][Ord[i]]) + ' Seq: ' + str(peps[1][Ord[i]]))
        elif scanData['blind'] == 1:
            for i in range(min(Ord.size, maxNum)):
                try:
                    print 'Score: ', peps[0][Ord[i]], 'Seq: ', ''.join(peps[1][Ord[i]][0]), 'Mod Names: ', peps[2][Ord[i]][1]
                    # out.append('Score: ' + str(peps[0][Ord[i]]) + ' Seq: ' + ''.join(peps[1][Ord[i]][0]) + ' Mod Names: ' + peps[2][Ord[i]][1])
                except TypeError:
                    print 'Score: ', peps[0][Ord[i]], 'Seq: ', peps[1][Ord[i]][0], 'Mod Names: ', peps[2][1]
                    # out.append('Score: ' + str(peps[0][Ord[i]]) + ' Seq: ' + peps[1][Ord[i]][0] +  ' Mod Names: ' + peps[2][1])
        
        scanData['sequencing time'] = time.time() - s1
        print '\nTime Taken:', time.time() - s1, '\n'    
    # out.append('\nTime Taken: ' + str(time.time() - s1) + '\n')

    if validateHeavySequence(scanData['seq'], heavySeqMap, scanData['ambiguous edges']):
        for scanF in samePeptideClusters[pair[0]] + samePeptideClusters[pair[1]]:
            scanFDict[scanF]['sequenced'] = True
        if options.output:
            for pair in [(lightScanF, heavyScanF) for lightScanF in samePeptideClusters[pair[0]] for heavyScanF in samePeptideClusters[pair[1]]]:
                scanData['light scan'] = int(pair[0])
                scanData['heavy scan'] = int(pair[1])                  
                # outFile.write('\t'.join([str(scanData[col]) for col in cols]) + '\n')
                # print str(scanData[col])
                res.append([str(scanData[col]) for col in cols])
    else:
        print 'WARNING: Invalid sequence! Unsuccessful sequencing of %s and %s with pair configuration %s' % (str(samePeptideClusters[pair[0]]), str(samePeptideClusters[pair[1]]), pairConfigName)

    exit(0)
def addPlausibleCandidatesFromModList(connection,
                                      fracs,
                                      expand_mods,
                                      data_dir,
                                      hashed_single_mods,
                                      hashed_mod_combos,
                                      prob_network,
                                      ep_step=0.01,
                                      mod_tolerance=0.1,
                                      ppmSTD=10,
                                      isobaric_mod_penalty=-0.5,
                                      def_mod_penalty=-1,
                                      indel_penalty=-3,
                                      undef_mod_penalty=-3,
                                      spectrum_score_cutoff=0,
                                      max_per_scan=10):

    for fraction_id, fraction_name in fracs:
        # Load in dta info
        frac_num = int(fraction_name[1:])
        ''' Replace os.path.sep with '/' to fix Windows backslash issues. --smp
        dta_dir = glob.glob(data_dir + os.path.sep + '*f%02d'%frac_num)[0] + os.path.sep
        '''
        dta_dir = glob.glob(data_dir + '/' + '*f%02d' % frac_num)[0] + '/'
        dtaList = glob.glob(dta_dir + '*.dta')
        scanFDict = DataFile.getScanFDict(dtaList)

        # Get TAG-GRAPH results
        stmt = select([
            results.c.scan, results.c.alignment_score, results.c.context,
            results.c.mod_context, results.c.mods, results.c.mod_ranges,
            results.c.mod_ambig_edges, results.c.proteins,
            results.c.matching_tag_length, results.c.de_novo_peptide,
            results.c.unmod_de_novo_peptide, results.c.de_novo_score,
            results.c.num_matches, results.c.obs_mh, results.c.retention_time
        ]).where(results.c.fraction_id == fraction_id).where(
            results.c.spectrum_score > spectrum_score_cutoff).order_by(
                results.c.scan).order_by(desc(results.c.composite_score))
        response = connection.execution_options(
            stream_results=True).execute(stmt)

        indexed_results = defaultdict(list)
        for row in SFW.string_folding_wrapper(response):
            indexed_results[row[0]] += [row[1:]]

        new_scan_items = {}
        for scanF in indexed_results:
            # TODO: Don't add mod candidates for crappy results to save time (use spectrum prob score for this or protease specificity of localization?)
            # Can also do initial rounds of EM and use a probability cutoff (see above idea for gating mod candidates)

            # Eval mod lists and mod ranges once (will be using them over and over)
            mod_lists = []
            mod_ranges_list = []
            mod_tuples_list = []
            enumerated_mods = defaultdict(set)
            exact_match = False
            for item in indexed_results[scanF]:
                mods = eval(item[3])
                mod_lists += [mods]
                mod_ranges = tuple(eval(item[4]))
                mod_ranges_list += [mod_ranges]
                if len(mods) == 0 or all(
                    [mod[0][0] == 'Isobaric Substitution' for mod in mods]):
                    exact_match = True
                    break

                mod_tuples = []
                for j, mod in enumerate(mods):
                    mod_tuple = Validator.getUniqueModTuple(
                        [mod], undef_mod_round_precision=2)[0]
                    enumerated_mods[(item[1], mod_ranges[j])].add(mod_tuple)
                    mod_tuples += [mod_tuple]

                mod_tuples_list += [mod_tuples]

            #print fraction_name, scanF, exact_match
            # Don't add mod candidates if exact match is found
            if exact_match:
                continue

            #if scanF != 5841 or fraction_name != 'F12':
            #    continue

            #print '-----------------------'
            #print indexed_results[scanF]
            # Add mod candidates which can plausibly be the sum of two separate mods
            new_combo_mods = addComboModCandidates(
                scanF, indexed_results[scanF], mod_lists, mod_ranges_list,
                mod_tuples_list, enumerated_mods, scanFDict, expand_mods,
                hashed_mod_combos, prob_network, ep_step, mod_tolerance,
                ppmSTD)
            #print 'Num Combo Mods after getUniqueCandidates', sum([len(val[1]) for val in new_combo_mods])
            #print '---Combo Mods---'
            #print new_combo_mods

            #print enumerated_mods
            new_single_mods = addSingleModCandidates(
                scanF, indexed_results[scanF], mod_lists, mod_ranges_list,
                mod_tuples_list, enumerated_mods, scanFDict, expand_mods,
                hashed_single_mods, prob_network, ep_step, mod_tolerance,
                ppmSTD)
            #print 'Num Single Mods after getUniqueCandidates', sum([len(val[1]) for val in new_single_mods])
            #print '---Single Mods---'
            #print new_single_mods

            new_scan_items[scanF] = new_single_mods + new_combo_mods
            #print scanF, new_scan_items[scanF]

        #print 'Indexed results scans', sorted(indexed_results.keys())
        #print 'new_scan_items scans', sorted(new_scan_items.keys())
        # Import new candidates into DB
        values = []
        for scanF in indexed_results:
            #print scanF, scanF in new_scan_items, int(scanF) in new_scan_items, str(scanF) in new_scan_items
            if scanF in new_scan_items:
                #print 'NUM', len(new_scan_items[scanF])
                indexed_item = indexed_results[scanF][0]
                i = 0
                # Sort candidates by sum of prevalences of mods
                # No need to sort by composite_score, only top scoring mods for each (context, mod_tuple) pair are included in the new_scan_items (filtered using getUniqueCandidates)
                for item in sorted(new_scan_items[scanF],
                                   key=lambda k: -sum(
                                       [expand_mods[mod]
                                        for mod in k[0][1]]) / len(k[0][1])):
                    for candidate in item[1]:
                        candidate.update({
                            "scan":
                            scanF,
                            "charge":
                            scanFDict[scanF]['charge'],
                            "matching_tag_length":
                            indexed_item[7],
                            "time_taken":
                            0,
                            "de_novo_peptide":
                            indexed_item[8],
                            "unmod_de_novo_peptide":
                            indexed_item[9],
                            "de_novo_score":
                            indexed_item[10],
                            "num_matches":
                            indexed_item[11],
                            "obs_mh":
                            indexed_item[12],
                            "retention_time":
                            indexed_item[13],
                            "ppm": (candidate["theo_mh"] - indexed_item[12]) /
                            candidate["theo_mh"] * 1000000,
                            "fraction_id":
                            fraction_id
                        })

                        values += [candidate]
                        i += 1

                    if i > max_per_scan:
                        break

        print 'Adding %i candidates for fraction %s' % (len(values),
                                                        fraction_name)
        res = connection.execute(results.insert(), values)

    return new_scan_items
def addComboModCandidates(scanF,
                          scan_items,
                          mod_lists,
                          mod_ranges_list,
                          mod_tuples_list,
                          enumerated_mods,
                          scanFDict,
                          expand_mods,
                          hashed_mod_combos,
                          prob_network,
                          ep_step=0.01,
                          mod_tolerance=0.1,
                          ppmSTD=10):
    add_mods_map = defaultdict(set)

    # Go through entire candidate list, identify alternate combo mod interpretations for given mod ranges
    for i, item in enumerate(scan_items):
        mod_ranges = mod_ranges_list[i]

        for j, mod in enumerate(mod_lists[i]):
            # Continue if mod has already been expanded
            # Format of key in add_mods_map is (context, mod_range)
            if mod[0][0] == 'Insertion' or mod[0][0] == 'Deletion' or mod[0][
                    0] == 'Isobaric Substitution':
                continue

            # print j, mod, mod_ranges, mod_ranges[j], item[1]
            # Initialize set so that this can be skipped if it comes up in the future (and no candidates are found)
            add_mods_map[(item[1], mod_ranges[j],
                          mod_tuples_list[i][j])] = set()
            # now hash mass of mods in peptides to see if alternate combo candidates can be found
            for mod_combo_candidate in hashed_mod_combos[mod_tuples_list[i]
                                                         [j]]:

                # ModError is defined as mass of de_novo_seq - mass of modified reference_seq
                mod_error = (0 if not mod[0][2] else
                             mod[0][2]) - mod_combo_candidate[-1]
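                # Illustrative example (assumed values): an observed de novo shift of +42.04 Da
                # against a combo candidate summing to +42.01 Da gives mod_error = +0.03 Da,
                # which passes the default mod_tolerance of 0.1 Da checked below.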
                # make sure that mod_error is within tolerance and no expanded mod for given mod interval has greater prevalence than either mod in mod combo
                if not (abs(mod_error) > mod_tolerance or any([
                        expand_mods[enum_mod] >
                        expand_mods[mod_combo_candidate[0]]
                        or expand_mods[enum_mod] >
                        expand_mods[mod_combo_candidate[1]]
                        for enum_mod in enumerated_mods[(item[1],
                                                         mod_ranges[j])]
                ])):

                    add_mods_map[(item[1], mod_ranges[j],
                                  mod_tuples_list[i][j])].add(
                                      (mod_combo_candidate, mod_error))

    #print 'Add mods', add_mods_map
    # Get Sequence candidates for mod ranges which have valid combo mods
    candidates_map = {}
    # print add_mods_map.keys()
    for context, mod_range, mod_tuple in add_mods_map:
        candidates = []
        term = getTerminus(mod_range[0], mod_range[1], context)

        for mod_combo in add_mods_map[(context, mod_range, mod_tuple)]:
            mod_1, mod_2 = mod_combo[0][:2]
            subseq = context[2:-2][mod_range[0]:mod_range[1]]
            locs_1 = getModLocs(subseq, term, mod_1)
            locs_2 = getModLocs(subseq, term, mod_2)
            # TODO: Only add mod combo with most prevalent single mods for a given (context, mod_range) combination (as opposed to all valid mod combos as is done now)?
            for loc_1 in locs_1[1]:
                for loc_2 in locs_2[1]:
                    if loc_1 < loc_2:
                        candidates += getComboModSeq(subseq, mod_1, loc_1,
                                                     mod_2, loc_2,
                                                     mod_combo[1])
                    elif loc_1 > loc_2:
                        candidates += getComboModSeq(subseq, mod_2, loc_2,
                                                     mod_1, loc_1,
                                                     mod_combo[1])
                    elif locs_1[0] != locs_2[0] and (
                            mod_1[0] == 'undefined mass shift'
                            or mod_2[0] == 'undefined mass shift'
                            or mod_1[2][1] != mod_2[2][1]):
                        # Second part of if statement guards against things like putting a trimethyl (A, N-term) with a Carbamyl (N-term, N-term) on the same residue
                        candidates += getComboModSeq(subseq, mod_1, loc_1,
                                                     mod_2, loc_2,
                                                     mod_combo[1])

        if len(candidates) > 0:
            candidates_map[(context, mod_range, mod_tuple)] = candidates

    #print candidates_map
    # Note: the way this code is written now, this method might produce LOTS of duplicate entries, particularly if the peptide in question is multiply modified
    # This is because the mod_range which has a valid combo mod is expanded for each scan item (in each scan item, the position of the mod within the mod_range may be different, but it expands out to the same set of new candidates)
    # In the future (if this way is too slow), we can minimize this time by caching the position, mod for each enumerated candidate, and only expand if the set of all mods (not including the mod_range to expand) is unique
    # For now, redundant candidates are filtered out at return step
    new_scan_items = []
    if len(candidates_map) > 0 and scanF in scanFDict:
        precMass = scanFDict[scanF]['precMass']
        epSTD = ppmSTD * precMass * 10**-6
        spec = PN.Spectrum(prob_network,
                           precMass,
                           Nmod=0.0,
                           Cmod=0.0,
                           epsilon=2 * epSTD,
                           spectrum=DataFile.getMassIntPairs(
                               scanFDict[scanF]['dta']),
                           useMemo=True)
        spec.initializeNoiseModel()

        # Generate new entries for scan from peptides
        for i, item in enumerate(scan_items):
            for j, mod_range in enumerate(mod_ranges_list[i]):
                if (item[1], mod_range,
                        mod_tuples_list[i][j]) in candidates_map:
                    new_scan_items += [
                        getComboModCandidate(spec, scanFDict[scanF]['charge'],
                                             item, mod_lists[i],
                                             mod_ranges_list[i], j,
                                             candidate[0], candidate[1],
                                             candidate[2], candidate[3],
                                             candidate[4])
                        for candidate in candidates_map[(
                            item[1], mod_range, mod_tuples_list[i][j])]
                    ]

    #print 'Num Combo Mods before getUniqueCandidates', len(new_scan_items)
    return getUniqueCandidates(new_scan_items)
Example #30
import DataFile
from urllib.parse import quote
from io import BytesIO
from PIL import Image
import base64
import subprocess
import demjson
import sys
import time
import Mail
import Template

url_prefix = "http://wap.sogou.com.inner/web/searchList.jsp?keyword="

get_word_loc = "http://10.143.54.80:81/vr_query_period/vr_query_garbled.txt"
word_file = "./word_top"
word_list = DataFile.read_file_into_list("./word_top")

pic_dir_prefix = "/search/odin/nginx/html/wap/tupu_garbled_pic/pic"

report_tmp_path = "mail_detail.html"
mail_to = "*****@*****.**"


def log_info(str):
    time_str = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    sys.stdout.write('[%s] [info] %s\n' % (time_str, str))
    sys.stdout.flush()


def utf8stdout(in_str):
    # Assumed completion of the truncated original: write to stdout (fd 1) as UTF-8
    utf8stdout = open(1, 'w', encoding='utf-8', closefd=False)
    print(in_str, file=utf8stdout)

if __name__ == '__main__':
    print 'This program generates a results file containing Raw lads output postscored with the algorithm of choice. The discmodel is a supplied model, if necessary for the postscoring algorithm'
    options = ArgLib.parse(['init', 'ppmstd', 'dtadir', 'lads', 'sequest', 'config', 'model', 'output', 'symbolmap'], optArgs=[{'opts': ('-D', '--discmodel'), 'attrs': {'type': 'string', 'dest': 'discmodel', 'help': 'Model used to calculate discriminant score'}}, {'opts': ('-P', '--pairconfig'), 'attrs': {'type': 'string', 'dest': 'pairconfig', 'help': 'Name of LADS Pair Configuration'}}, {'opts': ('-F', '--featurelist'), 'attrs': {'type': 'string', 'dest': 'featurelist', 'help': 'File containing pickled list of desired features (optional)'}}])
    parent = os.path.abspath(os.pardir)
                           
    PNet = PN.ProbNetwork(options.config, options.model)
    
    paramsDict = ArgLib.parseInitFile(options.init, options)
    pairConfigurations = paramsDict['Pair Configurations']

    LADSSeqInfo = GLFD.parseSequenceDTAsLogfile(options.lads)

    with open(options.symbolmap, 'r') as fin:
        symbolMap = pickle.load(fin)
    seqMap = DataFile.generateSeqMap({'LADS Unit Test': 'LADS'}, symbolMap, paramsDict)
    seqMap = seqMap['LADS Unit Test']

    if options.featurelist:
        with open(options.featurelist) as fin:
            desired_feats = pickle.load(fin)
    else:
        desired_feats = None

    heavySeqMaps = {}
    for confName in pairConfigurations:
        heavySeqMaps[confName] = copy.deepcopy(seqMap)
        heavySeqMaps[confName]['Mods']['N-Term'] = pairConfigurations[confName]['NModSymbol']
        heavySeqMaps[confName]['Mods']['C-Term'] = pairConfigurations[confName]['CModSymbol']

    if options.pairconfig:
        binsDict[bin][2] = binsDict[bin][0] - binsDict[bin][1]

    outFile.write('\n%s Scan Number Difference Distribution. Max Diff: %i' % (name, maxDiff) + '\n')
    outFile.write('\t'.join(['Diff Bin', 'Test Pairs', 'True Pairs', 'False Pairs']) + '\n')
    for i in range(numBins):
        outFile.write('\t'.join([str(elem) for elem in [bins[i], binsDict[i][0], binsDict[i][1], binsDict[i][2]]]) + '\n')
    
    
if __name__ == '__main__':
    print 'Model refers to svmmodel used'
    options = ArgLib.parse(['dtadir', 'combined', 'sequest', 'mascot', 'database', 'output', 'ppmstd', 'init', 'symbolmap'])
    
    paramsDict = ArgLib.parseInitFile(options.init, options)
    progDict = ArgLib.getProgDict(An.searchprogs, options)

    dbDict = DataFile.getDBInfo(options.database)
    infoMap = dbDict['infoMap']
    
    with open(options.symbolmap, 'r') as fin:
        symbolMap = pickle.load(fin)
    
    seqMap = DataFile.generateSeqMap(progDict, symbolMap, paramsDict)

    processedInfo = {} 
    
    if options.mascot:
        MASCOTdict = eval(options.mascot)
        processedInfo.update(CS.parseScans(MASCOTdict, 'MASCOT', seqMap, dbDict))     
    if options.sequest:
        SEQUESTdict = eval(options.sequest)
        processedInfo.update(CS.parseScans(SEQUESTdict, 'SEQUEST', seqMap, dbDict))
Example #33
    combMasses = []
    for cluster in clusters:
        if len(cluster) > 1:
            combMasses += [sum(cluster) / len(cluster)]
        else:
            combMasses += cluster
    
    return np.sort(np.array(combMasses))
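
# A self-contained sketch (hypothetical helper name, not from the original source) of the
# combining rule above: multi-member clusters collapse to their mean mass, singletons pass
# through unchanged, and the result comes back sorted.
import numpy as np

def combine_clustered_masses(clusters):
    combined = []
    for cluster in clusters:
        if len(cluster) > 1:
            combined.append(sum(cluster) / len(cluster))   # average the clustered masses
        else:
            combined.extend(cluster)                       # keep singletons as-is
    return np.sort(np.array(combined))

# combine_clustered_masses([[100.1, 100.2], [250.0]]) -> array([100.15, 250.0]) (approximately)
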
    
if __name__ == '__main__':
    dirPath = 'C:\\Users\\Arun\\Pythonprojects\\DeNovoSequencing\\LF2_short_HCD+CID_ath001862_244\\'
    
    ppm = 5
    heavyPath = dirPath + '244.3611.3611.1.dta'
    lightPath = dirPath + '244.3619.3619.1.dta'
    heavyPairs = DataFile.getMassIntPairs(heavyPath)
    lightPairs = DataFile.getMassIntPairs(lightPath)
    heavyPrecMass, heavyCharge = DataFile.getPrecMassAndCharge(heavyPath) 
    lightPrecMass, lightCharge = DataFile.getPrecMassAndCharge(lightPath)   
    
    print ppm * 10 ** -6 * heavyPrecMass
    print getSharedPeaksRatio(lightPairs, heavyPairs, Nmod=0, Cmod=Constants.mods['*'], epsilon=ppm * heavyPrecMass * 10 ** -6)
    
     
    """
    tPath = dirPath + '244.0855.0855.1.dta'
    tMass = DataFile.getPrecMassAndCharge(tPath)[0] 
    tPairs = DataFile.getMassIntPairs(tPath)
    tIons = tPairs[:,0]
    tIons = np.insert(tIons, 0, 0)
    tIons = np.append(tIons, tMass)
            "ppmpenalty",
            "ambigpenalty",
            "minedge",
            "maxedge",
            "alpha",
            "subgraphcut",
            "symbolmap",
        ]
    )
    epStep = 0.00025
    maxEp = 0.1

    paramsDict = ArgLib.parseInitFile(options.init, options)
    with open(options.symbolmap, "r") as fin:
        symbolMap = pickle.load(fin)
    seqMap = DataFile.generateSeqMap({"LADS Unit Test": "LADS"}, symbolMap, paramsDict)

    if options.columns:
        with open(options.columns) as fin:
            cols = pickle.load(fin)
    else:
        print "Using default cols"
        cols = [
            "light scan",
            "heavy scan",
            "pair configuration",
            "M+H",
            "score",
            "seq",
            "epsilon",
            "ambiguous edges",
Example #35
0
    outFile.write("\t".join(["Diff Bin", "Test Pairs", "True Pairs", "False Pairs"]) + "\n")
    for i in range(numBins):
        outFile.write(
            "\t".join([str(elem) for elem in [bins[i], binsDict[i][0], binsDict[i][1], binsDict[i][2]]]) + "\n"
        )


if __name__ == "__main__":
    print "Model refers to svmmodel used"
    options = ArgLib.parse(
        ["dtadir", "combined", "sequest", "mascot", "database", "output", "ppmstd", "init", "symbolmap"]
    )

    paramsDict = ArgLib.parseInitFile(options.init, options)
    progDict = ArgLib.getProgDict(An.searchprogs, options)
    dbDict = DataFile.getDBInfo(options.database)
    with open(options.symbolmap, "r") as fin:
        symbolMap = pickle.load(fin)

    seqMap = DataFile.generateSeqMap(progDict, symbolMap, paramsDict)
    outFile = open(options.output, "w")

    print options.dtadir
    dtaList = glob.glob(options.dtadir + "/*.dta")
    scanFDict = getScanFDict(dtaList)

    processedInfo = {}

    if options.mascot:
        MASCOTdict = eval(options.mascot)
        processedInfo.update(CS.parseScans(MASCOTdict, "MASCOT", seqMap, dbDict))
Example #36
0
def loadInit(self):
    self._paramsDict = DataFile.parseParams(self._selectedInitFile.get())
    with open('../Misc/symbolmap.txt', 'r') as fin:
        symbolMap = pickle.load(fin)
    self._seqMap = DataFile.generateSeqMap({'LADS Unit Test': 'LADS'}, symbolMap, self._paramsDict)['LADS Unit Test']
    self._aas = Constants.addPepsToAADict(self._minedge)
import DataFile
import Template
import Mail
import random

top_word_loc = "http://$ip/vr_query_period/vr_query_pv.txt"
random_word_loc = "http://$ip/vr_query_period/vr_query_random.txt"

top_word_file = "./word_top"
random_word_file = "./word_random"

url_file = "./url_lizhi"
url_prefix = "https://wap.sogou.com/web/searchList.jsp?keyword="

#mail_lst = ['*****@*****.**']
mail_lst = DataFile.read_file_into_list("./mail_list")
report_tmp_path = "mail_detail.html"


def get_word(url, word_file):

    try:
        res = requests.get(url)
        res.encoding = "utf-8"
        with open(word_file, 'w', encoding='utf8') as f:
            f.write(res.text)
    except Exception as err:
        print('[get_word]: %s' % err)
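

# A hedged usage sketch (not part of the original script): refresh the top-query word list
# from its configured location and read it back with the same DataFile helper used
# elsewhere in these scripts. The wrapper name _refresh_top_words is hypothetical.
def _refresh_top_words():
    get_word(top_word_loc, top_word_file)
    return DataFile.read_file_into_list(top_word_file)
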


def gen_url(f_in, f_out):
Example #38
0
        '''
        mzMLFiles  = glob.glob(dataDir + '/' + '*f%02d.mzML'%options.fraction)
        mzXMLFiles = glob.glob(dataDir + '/' + '*f%02d.mzXML'%options.fraction)

        if len(mzMLFiles) > 0:
            ## Found mzML file for this fraction
            fileFound = True
            mzMLFile = os.path.abspath(mzMLFiles[0])
            
            ## The directory for DTA files will be the same as the mzML file without the .mzML extension
            mzml_name_base = mzMLFile[:-5]
            print 'mzMLFile: "%s"' % (mzMLFile)
            print 'mzml_name_base: "%s"' % (mzml_name_base)
        
            # Unpack DTAs
            DataFile.executeProcess(SCRIPTS_DIR, 'mzml2dta.py', ['-o', mzml_name_base, mzMLFile])
            ''' Replace os.path.sep with '/' to fix Windows backslash issues. --smp
            dtaDir = glob.glob(dataDir + os.path.sep + '*f%02d'%options.fraction)[0] + os.path.sep
            '''
            localDtaDir = glob.glob(dataDir + '/' + '*f%02d'%options.fraction)[0] + '/'
            print 'Found mzML, setting dtaDir to %s' % (localDtaDir)
        elif len(mzXMLFiles) > 0:
            ## Found mzXML file for this fraction
            fileFound = True
            mzXMLFile = os.path.abspath(mzXMLFiles[0])
            
            '''
            ## The directory for DTA files will be the same as the mzML file without the .mzML extension
            mzml_name_base = mzMLFile[:-5]
            ## print 'mzml_name_base: "%s"' % (mzml_name_base)
            '''
Example #39
0
import time
import os, sys
from bs4 import BeautifulSoup
import DataFile
from urllib.parse import quote
import Mail
import Template
from ast import literal_eval
from itertools import chain

MIN_TEXT_LENGTH = 5
url_prefix = "http://wap.sogou.com.inner/web/searchList.jsp?keyword="
get_word_loc = "http://10.143.54.80:81/vr_query_period/vr_query_garbled_lizhi.txt"
word_file = "./word_lizhiqa"
word_list = DataFile.read_file_into_list("./word_lizhiqa")
report_tmp_path = "mail_detail.html"
mail_to = "*****@*****.**"
result_file = 'lizhiqa_garbled_result'
f_res = open(result_file, 'w', encoding='utf8')


class Node(object):
    def __init__(self, query, vrid):
        self.query = query
        self.vrid = vrid
        self.url = ""
        self.html = ""
        self.qa_text = ""
        self.garble = False
        self.garble_res = ""
Example #40
0
#!/usr/bin/python3
# -*- coding: utf-8 -*-

from pyppeteer import launch
import asyncio
from urllib.parse import quote
import DataFile
import datetime
import os
import time
import random

wordlist = DataFile.read_file_into_list("./vr_1")


async def action_get_page_content(page):
    content = await page.evaluate('document.documentElement.outerHTML',
                                  force_expr=True)
    return content


async def action_is_element_exist(page, selector):
    # Return the first HTML Element in the document that matches the given selector
    # (or selector group); returns null if there is no match.
    el = await page.querySelector(selector)
    return el
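

# A hedged driver sketch (not part of the original file) showing how the two helpers above
# might be exercised together: open a page, test whether a selector is present, and return
# the rendered HTML when it is. The URL and selector are caller-supplied placeholders.
async def _demo_check_selector(url, selector):
    browser = await launch()                          # headless Chromium via pyppeteer
    page = await browser.newPage()
    await page.goto(url)
    element = await action_is_element_exist(page, selector)
    html = await action_get_page_content(page) if element else ""
    await browser.close()
    return element is not None, html

# Example (hypothetical URL and selector):
# asyncio.get_event_loop().run_until_complete(
#     _demo_check_selector("https://example.com", "div.result"))
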


async def action_get_result_loc(page):
    result_loc_list = await page.evaluate('''() => {
                        //classid blacklist: 50000000 placeholder, 50023801/50023901/50024901 related recommendations, 30010081 mid-page hint, 21312001 relation graph, 11005401 Sogou Wenwen questions
	                    classidBlackArr = ['50000000','50023801','50023901','30010081','21312001','11005401', '50024901'];
Example #41
0
            if LCs[i] < cutOff:
                procSeq[i] = ambigAA
                ambig_edges += [(0, Constants.aminoacids[aa][2])]
        return ''.join(procSeq), ambig_edges
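

# A hedged, self-contained sketch (not part of the original PEAKS parser) of the idea used
# above: residues whose local confidence (LC) falls below the cutoff are replaced with an
# ambiguity symbol and recorded as "ambiguous edges" carrying only their residue mass.
# The residue-mass table below is a tiny illustrative subset, not the Constants module.
def _mask_low_confidence(seq, LCs, cutoff, ambigAA='@'):
    residue_mass = {'G': 57.02146, 'A': 71.03711, 'S': 87.03203}  # hypothetical subset
    procSeq = list(seq)
    ambig_edges = []
    for i, aa in enumerate(seq):
        if LCs[i] < cutoff:
            procSeq[i] = ambigAA
            ambig_edges.append((0, residue_mass[aa]))
    return ''.join(procSeq), ambig_edges

# _mask_low_confidence('GAS', [99, 40, 85], cutoff=50) -> ('G@S', [(0, 71.03711)])
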


if __name__ == '__main__':
    print 'In this program, the PEAKS argument is just the location of the PEAKS output to parse. The number argument indicates the ALC cutoff used to form ambiguous edges (set to 0 to not form any ambiguous edges).'
    options = ArgLib.parse(['init', 'output', 'symbolmap', 'peaks', 'cutoff'])

    AMBIG_AA = '@'

    paramsDict = ArgLib.parseInitFile(options.init, options)
    with open(options.symbolmap, 'r') as fin:
        symbolMap = pickle.load(fin)
    seqMap = DataFile.generateSeqMap({'PEAKS': 'PEAKS'}, symbolMap, paramsDict)
    #print seqMap

    scanInfo = DataFile.getScanInfo(options.peaks, delimiter=',')[1]
    if 'Peptide' in scanInfo[0]:
        seq_col = 'Peptide'
    else:
        seq_col = 'Sequence'

    outFile = open(options.output, 'w')

    #print 'cutoff_arge', options.cutoff
    cols = ['ScanF', 'Charge', 'RT', 'Obs M+H', 'Peptide', 'ALC (%)', 'LC']
    alc_cutoff = options.cutoff if options.cutoff else 0
    if alc_cutoff > 0:
        cols.insert(-2, 'Ambig Edges')
Example #42
0
        return False

def parseDBScans(fDict, prog, seqMap, dbDict):
    processedInfo = {}
    for csvfile in fDict.keys():
        MASCOTData = DataFile.getScanInfo(csvfile, dbDict[prog]['fields'], delimiter=',')
        processedInfo[fDict[csvfile]] = An.preprocessDatabaseScanInfo(MASCOTData, seqMap[fDict[csvfile]], dbDict[prog]['fieldmap'])
    
    return processedInfo
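

# A hedged usage sketch (not in the original script): parseDBScans takes a dict mapping
# each results CSV to the label it should carry in processedInfo, plus the seqMap and
# dbDict built in the main block below. The file name and label here are illustrative only.
def _demo_parse_mascot(seqMap, dbDict):
    example_fDict = {'mascot_run1.csv': 'MASCOT Run 1'}   # hypothetical CSV -> label mapping
    return parseDBScans(example_fDict, 'MASCOT', seqMap, dbDict)
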

#Number argument refers to minimum number of search prog results which have the same peptide for it to be included in the final output
if __name__== '__main__':
    options = ArgLib.parse(['init', 'sequest', 'lads', 'mascot', 'output', 'database', 'symbolmap', 'number'])
    
    paramsDict = ArgLib.parseInitFile(options.init, options)
    dbDict = DataFile.getDBInfo(options.database)
    progDict = ArgLib.getProgDict(An.searchprogs, options)
    
    with open(options.symbolmap, 'r') as fin:
        symbolMap = pickle.load(fin)
    seqMap = DataFile.generateSeqMap(progDict, symbolMap, paramsDict)
    
    if hasattr(options, 'number'):
        minNumScans = int(options.number)
    else:
        minNumScans = 1
        
    processedInfo = {}  
    if options.lads:
        LADSdict = eval(options.lads)
        for tdvfile in LADSdict.keys():
Example #43
0
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "h", ["help"])
    except getopt.GetoptError:
        sys.exit(2)
    for opt, arg in opts:
        print 'opt: %s' % (opt)
        print 'arg: %s' % (arg)
        if opt in ("-h", "--help"):
            printUsage()
            sys.exit(1)
    ''' Now process the arguments (INI file path)'''
    if len(args) != 1:
        printUsage()
        sys.exit(1)
    configFileName = args[0]
    ### create a output file/handle:
    tmpFolder = tempfile.gettempdir()
    (tm_year, tm_mon, tm_mday, tm_hour, tm_min, tm_sec, tm_wday, tm_yday,
     tm_isdst) = time.localtime(time.time())
    runCapture = tmpFolder + '/RunTG' + str(tm_mon) + str(tm_mday) + str(
        tm_hour) + str(tm_min) + str(tm_sec) + '.txt'
    fh = open(runCapture, 'w')
    write2FileStdout(
        fh, '**** start TagGraph process: %s' % (datetime.datetime.now()))
    write2FileStdout(fh, TAGGRAPH_CONFIG_HEADER)
    write2FileStdout(fh, configFileName + "\n")
    if os.path.isfile(configFileName) and os.access(configFileName, os.R_OK):
        write2FileStdout(fh, MSGBORDER)
        write2FileStdout(fh, "Using Configuration File: %s" % configFileName)
        write2FileStdout(fh, MSGBORDER)
    else:
        #print ' ** FAILURE ** Could not read configuration file: \'%s\'' % (configFileName)
        write2FileStdout(
            fh, ' ** FAILURE ** Could not read configuration file: \'%s\'' %
            (configFileName))
        sys.exit(1)
    theConfig = ConfigParser.ConfigParser()
    theConfig.optionxform = str
    theConfig.read(configFileName)
    #sectionNames = theConfig.sections()
    generalSectionMap = ConfigSectionMap(theConfig, "General")
    tagGraphSectionMap = ConfigSectionMap(theConfig, "TagGraph")
    ######    'init', 'dtadir', 'peaks', 'output', 'ppmstd', 'modtolerance', 'unimoddict', 'maxcounts', 'modmaxcounts', 'fmindex', 'model', 'config'
    ## Define our Required Arguments ##
    fatalError = False
    ## Arguments that must exist, and be numbers ##
    requiredTagGraphNumericArgs = [
        'ppmstd', 'modtolerance', 'maxcounts', 'modmaxcounts'
    ]
    ## Arguments that must exist, and be paths that point to files that exist and are Readable ##
    requiredTagGraphExistingFiles = [
        'unimoddict', 'model', 'config', 'init', 'de_novo'
    ]
    ## Arguments that must exist, and be directories that can be created on the filesystem ##
    requiredTagGraphToCreate = ['output']
    ## Special Arguments:
    # ExperimentName must be a string
    # d must be a directory, with mzML/mzXML files in it that start with ExperimentName
    # f must be an fmindex name of the form <basepath>.fm, where <basepath> is the basename and the following files should exist: <basename>.fm.1, <basename>.seqnames.1, <basename>.offsets
    ## Arguments that must exist, and be numbers ##
    for currArg in requiredTagGraphNumericArgs:
        if currArg in tagGraphSectionMap:
            if isNumeric(tagGraphSectionMap[currArg]):
                write2FileStdout(
                    fh,
                    '* Found Required Numeric TagGraph Parameter \'%s\'  : \'%s\''
                    % (currArg, tagGraphSectionMap[currArg]))
            else:
                fatalError = True
                write2FileStdout(
                    fh,
                    '** FAILURE ** Required TagGraph Parameter \'%s\' must be a numeric value, found value \'%s\''
                    % (currArg, tagGraphSectionMap[currArg]))
        else:
            fatalError = True
            write2FileStdout(
                fh,
                '** FAILURE ** Required TagGraph Parameter \'%s\' not found in config file'
                % (currArg))
    ## Arguments that must exist, and be paths that point to files that exist and are Readable ##
    for currArg in requiredTagGraphExistingFiles:
        if currArg in tagGraphSectionMap:
            if os.path.isfile(tagGraphSectionMap[currArg]) and os.access(
                    tagGraphSectionMap[currArg], os.R_OK):
                write2FileStdout(
                    fh,
                    '* Found Required Readable File for TagGraph Parameter \'%s\' : \'%s\''
                    % (currArg, tagGraphSectionMap[currArg]))
            else:
                if not os.path.isfile(tagGraphSectionMap[currArg]):
                    fatalError = True
                    write2FileStdout(
                        fh,
                        '** FAILURE ** Could not find file for Required Parameter \'%s\' at \'%s\''
                        % (currArg, tagGraphSectionMap[currArg]))
                elif not os.access(tagGraphSectionMap[currArg], os.R_OK):
                    fatalError = True
                    write2FileStdout(
                        fh,
                        '** FAILURE ** Could not Read file for Required Parameter \'%s\' at \'%s\' (check permissions)'
                        % (currArg, tagGraphSectionMap[currArg]))
        else:
            fatalError = True
            write2FileStdout(
                fh,
                '** FAILURE ** Required TagGraph Parameter \'%s\' not found in config file'
                % (currArg))
    ## Arguments that must exist, and be directories that should not already exist but can be created on the filesystem ##
    for currArg in requiredTagGraphToCreate:
        if currArg in tagGraphSectionMap:
            dirToCreate = tagGraphSectionMap[currArg]
            if not os.path.exists(dirToCreate):
                try:
                    ## Should be able to make the directory, and then remove it ##
                    os.makedirs(dirToCreate)
                    os.rmdir(dirToCreate)
                    write2FileStdout(
                        fh,
                        '* Found Required Createable Directory for TagGraph Parameter \'%s\' : \'%s\''
                        % (currArg, dirToCreate))
                except OSError:
                    fatalError = True
                    write2FileStdout(
                        fh,
                        '** FAILURE ** Unable to Create Directory for Required Parameter \'%s\' at \'%s\''
                        % (currArg, dirToCreate))
            else:
                fatalError = True
                write2FileStdout(
                    fh,
                    '** FAILURE ** File/Directory for Required Parameter \'%s\' at \'%s\' already exists! Should be created by TagGraph'
                    % (currArg, dirToCreate))
        else:
            fatalError = True
            write2FileStdout(
                fh,
                '** FAILURE ** Required TagGraph Parameter \'%s\' not found in config file'
                % (currArg))
    ## Now Lets Handle the Special Cases
    ## ExperimentName must be a string
    experimentName = ''
    if not 'ExperimentName' in tagGraphSectionMap:
        fatalError = True
        write2FileStdout(
            fh,
            '** FAILURE ** Required TagGraph Parameter \'ExperimentName\' not found in config file'
        )
    else:
        experimentName = tagGraphSectionMap['ExperimentName']
        write2FileStdout(
            fh, '* Found Required TagGraph Parameter ExperimentName: \'%s\'' %
            (experimentName))

    ## New Method: numFractions = 2, fraction01 = <path to file 1>, fraction02 = <path to file 2>
    numFractions = 0
    foundNumFractions = False
    dataDirectory = ''
    symLinkDir = symLinkBaseDir
    if not 'numFractions' in tagGraphSectionMap:
        ## Check for dataDirectory and automatically finding data files from the de novo files
        if not 'dataDirectory' in tagGraphSectionMap:
            fatalError = True
            write2FileStdout(
                fh,
                '** FAILURE ** Required Directory TagGraph Parameter \'dataDirectory\' not found in config file'
            )
        else:
            dataDirectory = tagGraphSectionMap['dataDirectory']
            if not dataDirectory.endswith('/'):
                dataDirectory += '/'
            if (not (dataDirectory.startswith('/'))):
                levelup = dataDirectory.count('../')
                if (levelup == 0):
                    dataDirectory = CUR_DIR + '/' + dataDirectory
                else:
                    splitDataDir = dataDirectory.split("/")
                    splitCurDir = CUR_DIR.split("/")
                    tmpD = ''
                    for i in xrange(0, len(splitCurDir) - levelup):
                        tmpD = tmpD + splitCurDir[i] + "/"
                    for i in xrange(levelup, len(splitDataDir) - 1):
                        tmpD = tmpD + splitDataDir[i] + "/"
                    dataDirectory = tmpD
            write2FileStdout(fh, "dataDirectory: %s" % dataDirectory)
            if not os.path.exists(dataDirectory):
                fatalError = True
                write2FileStdout(
                    fh,
                    '** FAILURE ** Required Directory TagGraph Parameter \'dataDirectory\' does not exist at: \'%s\''
                    % (dataDirectory))
            elif not os.path.isdir(dataDirectory):
                fatalError = True
                write2FileStdout(
                    fh,
                    '** FAILURE ** Required Directory TagGraph Parameter \'dataDirectory\' does not point to a directory at: \'%s\''
                    % (dataDirectory))
            else:
                ## We need to get the data file names from the de novo file, and check for them in the dataDirectory
                fileFractionMapping = []
                deNovoFile = tagGraphSectionMap['de_novo']
                if deNovoFile.upper().endswith('.XML') or deNovoFile.upper(
                ).endswith('.PEPXML') or deNovoFile.upper().endswith('.CSV'):
                    if deNovoFile.upper().endswith(
                            '.XML') or deNovoFile.upper().endswith('.PEPXML'):
                        fileFractionMapping = pepInput.getFileFractionMappingFromPepXML(
                            deNovoFile)
                    else:  ## deNovoFile.upper().endswith('.CSV'):
                        fileFractionMapping = pepInput.getFileFractionMappingFromCSV(
                            deNovoFile)
                    ## We should now have fileMapping, a list of tuples: (2-Digit Fraction Num, FileName)
                    ## mz[X]ML Files should be located in the dataDirectory
                    write2FileStdout(
                        fh, 'fileFractionMapping: %s' % fileFractionMapping)
                    symLinkDir += experimentName + '_' + str(os.getpid()) + '/'
                    dataFileSuffix = "mzML"
                    try:
                        ## Should be able to make the directory, and then remove it ##
                        os.makedirs(symLinkDir)
                        write2FileStdout(
                            fh,
                            '* Created temporary sym-link Directory for TagGraph mz[X]ML files \'%s\''
                            % (symLinkDir))
                        ## Lets write out the fileFractionMapping, pickled for easy reading/writing
                        mappingFilename = 'fileFractionMapping.pck'
                        mappingFilePath = os.path.join(symLinkDir,
                                                       mappingFilename)
                        mappingOutput = open(mappingFilePath, 'wb')
                        pickle.dump(fileFractionMapping, mappingOutput)
                        mappingOutput.close()
                        ##Create a symbolic link pointing to source named link_name.
                        for currFilledFractionNumber, currFilename in fileFractionMapping:
                            ## Check if source file exists
                            currFilePath = dataDirectory + currFilename
                            if not os.path.exists(currFilePath):
                                fatalError = True
                                write2FileStdout(
                                    fh,
                                    '** FAILURE ** Data File \'%s\' referenced in de novo file does not exist in dataDirectory \'%s\''
                                    % (currFilename, dataDirectory))
                            elif not os.access(currFilePath, os.R_OK):
                                fatalError = True
                                write2FileStdout(
                                    fh,
                                    '** FAILURE ** Data file \'%s\' Not Readable'
                                    % (currFilePath))
                            else:
                                currFractionFile = currFilePath
                                if currFractionFile.endswith('mzML'):
                                    dataFileSuffix = 'mzML'
                                elif currFractionFile.endswith('mzXML'):
                                    dataFileSuffix = 'mzXML'
                                else:
                                    fatalError = True
                                    dataFileSuffix = ''
                                    write2FileStdout(
                                        fh,
                                        '** FAILURE ** Data file \'%s\' must end in .mzML or .mzXML!'
                                        % (currFractionFile))
                                if not dataFileSuffix == '':
                                    symLinkFile = symLinkDir + experimentName + '_f' + currFilledFractionNumber + '.' + dataFileSuffix
                                    os.symlink(currFractionFile, symLinkFile)
                                    write2FileStdout(
                                        fh,
                                        '   * Created symLink \'%s\' to data file \'%s\''
                                        % (symLinkFile, currFractionFile))
                    except OSError:
                        fatalError = True
                        write2FileStdout(
                            fh,
                            '** FAILURE ** Unable to Create Directory for TagGraph mz[X]ML sym-links at \'%s\''
                            % (symLinkDir))
                else:
                    fatalError = True
                    write2FileStdout(
                        fh,
                        '** FAILURE ** Required de novo TagGraph Parameter \'de_novo\' must be named .CSV or .XML/.PEPXML, found \'%s\''
                        % (deNovoFile))
    else:
        numFractions = tagGraphSectionMap['numFractions']
        if isNumeric(numFractions):
            if float(numFractions).is_integer():
                foundNumFractions = True
                write2FileStdout(
                    fh,
                    '* Found Required integer TagGraph Parameter \'numFractions\'  : \'%s\''
                    % (numFractions))
                numFractions = int(numFractions)
            else:
                fatalError = True
                write2FileStdout(
                    fh,
                    '** FAILURE ** Required TagGraph Parameter \'numFractions\' must be an integer value, found value \'%s\''
                    % (numFractions))
        else:
            fatalError = True
            write2FileStdout(
                fh,
                '** FAILURE ** Required TagGraph Parameter \'numFractions\' must be a numeric value, found value \'%s\''
                % (numFractions))
    ## If we found numFractions, lets get the paths to the data files and make sym-links to them in a new directory ##
    ## sym-links will be named <ExperimentName>_f01.mz[X]ML, etc.                                                   ##
    if True == foundNumFractions:
        symLinkDir += experimentName + '_' + str(os.getpid()) + '/'
        dataFileSuffix = "mzML"
        try:
            ## Should be able to make the directory, and then remove it ##
            os.makedirs(symLinkDir)
            write2FileStdout(
                fh,
                '* Created temporary sym-link Directory for TagGraph mz[X]ML files \'%s\''
                % (symLinkDir))
        except OSError:
            fatalError = True
            write2FileStdout(
                fh,
                '** FAILURE ** Unable to Create Directory for TagGraph mz[X]ML sym-links at \'%s\''
                % (symLinkDir))
        ##Create a symbolic link pointing to source named link_name.
        for currFraction in xrange(1, numFractions + 1):
            filledFractionNumber = str(currFraction).zfill(2)
            if not str('fraction' +
                       filledFractionNumber) in tagGraphSectionMap:
                fatalError = True
                write2FileStdout(
                    fh,
                    '** FAILURE ** Required TagGraph Parameter \'fraction%s\' not found in config file'
                    % (filledFractionNumber))
                continue  # fraction entry missing; skip it to avoid a KeyError below
            currFractionFile = tagGraphSectionMap['fraction' +
                                                  filledFractionNumber]
            if currFractionFile.endswith('mzML'):
                dataFileSuffix = 'mzML'
            elif currFractionFile.endswith('mzXML'):
                dataFileSuffix = 'mzXML'
            else:
                fatalError = True
                write2FileStdout(
                    fh,
                    '** FAILURE ** Data file \'%s\' must end in mzML or mzXML!'
                    % (currFractionFile))
                continue  # unsupported extension; do not create a sym-link for this file
            symLinkFile = symLinkDir + experimentName + '_f' + filledFractionNumber + '.' + dataFileSuffix
            os.symlink(currFractionFile, symLinkFile)
            write2FileStdout(
                fh, '   * Created symLink \'%s\' to data file \'%s\'' %
                (symLinkFile, currFractionFile))
    # f must be an fmindex name of the form <basepath>.fm, where <basepath> is the full file path without the .fm extension, and the following files should exist: <basename>.fm.1, <basename>.seqnames.1, <basename>.offsets
    fmindexBase = ''
    if not 'fmindex' in tagGraphSectionMap:
        fatalError = True
        write2FileStdout(
            fh,
            '** FAILURE ** Required TagGraph Parameter \'fmindex\' (should be the basename of the fmindex files, ending in \'.fm\') not found in config file'
        )
    else:
        fmParam = tagGraphSectionMap['fmindex']
        write2FileStdout(
            fh,
            '* Found Required fmindex TagGraph Parameter \'%s\'' % (fmParam))
        if fmParam.endswith('.fm'):
            fmindexBase = fmParam[:-3]
        else:
            fmindexBase = fmParam
        # Now lets check for 3 fmIndex files ending in: .fm.1, .offsets, and .seqnames.1
        fmFile = fmindexBase + ".fm.1"
        fmOffsetFile = fmindexBase + ".offsets"
        fmSeqnamesFile = fmindexBase + ".seqnames.1"
        if not os.path.isfile(fmFile):
            fatalError = True
            write2FileStdout(
                fh,
                '    ** FAILURE ** Could not find required fmindex file at \'%s\''
                % (fmFile))
        elif not os.access(fmFile, os.R_OK):
            fatalError = True
            write2FileStdout(
                fh,
                '    ** FAILURE ** Could not Read required fmindex file \'%s\' (check permissions)'
                % (fmFile))
        else:
            write2FileStdout(
                fh, '   * Found Required readable fmindex file at \'%s\'' %
                (fmFile))
        if not os.path.isfile(fmOffsetFile):
            fatalError = True
            write2FileStdout(
                fh,
                '    ** FAILURE ** Could not find required fmindex Offset file at \'%s\''
                % (fmOffsetFile))
        elif not os.access(fmOffsetFile, os.R_OK):
            fatalError = True
            write2FileStdout(
                fh,
                '    ** FAILURE ** Could not Read required fmindex Offset file \'%s\' (check permissions)'
                % (fmOffsetFile))
        else:
            write2FileStdout(
                fh,
                '   * Found Required readable fmindex Offset file at \'%s\'' %
                (fmOffsetFile))
        if not os.path.isfile(fmSeqnamesFile):
            fatalError = True
            write2FileStdout(
                fh,
                '    ** FAILURE ** Could not find required fmindex Seqnames file at \'%s\''
                % (fmSeqnamesFile))
        elif not os.access(fmSeqnamesFile, os.R_OK):
            fatalError = True
            write2FileStdout(
                fh,
                '    ** FAILURE ** Could not Read required fmindex Seqnames file \'%s\' (check permissions)'
                % (fmSeqnamesFile))
        else:
            write2FileStdout(
                fh,
                '   * Found Required readable fmindex Seqnames file at \'%s\''
                % (fmSeqnamesFile))
    ### Now lets Check the EM step parameters that can be checked before TG runs ###
    expectationMaximizationSectionMap = ConfigSectionMap(theConfig, "EM")
    '''
    -i: same as TG -i parameter
    -F all
    -M 100
    -C 20
    -B = <-o parameter from TG>/results.db [checked after TG]
    -E: Same as TG ExperimentName parameter.
    -o: Output Prefix, will create files with the prefix <EM -o parameter> in the directory specified by the <TG -o parameter>
    '''
    ## Arguments that must exist, and be numbers
    # Special Case: EMFractions must be 'all' or a number. Note: EMFractions is now assumed to always be 'all'
    requiredEMNumericArgs = ['maxIterations',
                             'initIterations']  #,'EMFractions']
    ## Special Arguments:
    ## -o must be a string, the file prefix for the EM Output files (often 'EM_Results')
    ## Arguments that must exist, and be numbers ('EMFractions' is special, as a number or 'all')
    for currArg in requiredEMNumericArgs:
        if currArg in expectationMaximizationSectionMap:
            if isNumeric(expectationMaximizationSectionMap[currArg]):
                write2FileStdout(
                    fh,
                    '* Found Required EM Numeric Parameter \'%s\'  : \'%s\'' %
                    (currArg, expectationMaximizationSectionMap[currArg]))
            else:
                fatalError = True
                write2FileStdout(
                    fh,
                    '** FAILURE ** Required EM Parameter \'%s\' must be a numeric value, found value \'%s\''
                    % (currArg, expectationMaximizationSectionMap[currArg]))
        else:
            fatalError = True
            write2FileStdout(
                fh,
                '** FAILURE ** Required EM Parameter \'%s\' not found in config file'
                % (currArg))
    ## Now Lets Handle the Special Cases
    # resultsPrefix (Output Prefix) must be a string
    emResultsPrefix = ''
    if not 'resultsPrefix' in expectationMaximizationSectionMap:
        fatalError = True
        write2FileStdout(
            fh,
            '** FAILURE ** Required EM Parameter \'resultsPrefix\' not found in config file'
        )
    else:
        emResultsPrefix = expectationMaximizationSectionMap['resultsPrefix']
        write2FileStdout(
            fh, '* Found Required EM Parameter \'resultsPrefix\': \'%s\'' %
            (emResultsPrefix))
    #options = ArgLib.parse(['init', 'dtadir', 'peaks', 'output', 'ppmstd', 'modtolerance', 'unimoddict', 'maxcounts', 'modmaxcounts', 'fmindex', 'model', 'config'], optArgs=[{'opts': ('-x', '--splittaxon'), 'attrs': {'dest': 'splittaxon', 'action': 'store_true', 'default': False, 'help': 'Flag. For searches of metaproteomic databases, split identical context entries by taxon for accurate consideration via EM.'}}])
    ### If a fatal error was thrown, do not proceed ###
    if fatalError == True:
        write2FileStdout(
            fh,
            '*****  HALTING DUE TO FATAL ERROR IN TAGGRAPH OR EM PARAMETERS, SEE OUTPUT ABOVE!!! '
        )
        sys.exit(1)
    ## Lets set up the args properly for RUN_TAGGRAPH_HUMAN_PROTEOME_EASY.py ##
    tg_ppmstd = str(tagGraphSectionMap['ppmstd'])
    tg_modtolerance = str(tagGraphSectionMap['modtolerance'])
    tg_maxcounts = str(tagGraphSectionMap['maxcounts'])
    tg_modmaxcounts = str(tagGraphSectionMap['modmaxcounts'])
    tg_config = tagGraphSectionMap['config']
    tg_init = tagGraphSectionMap['init']
    tg_dtadir = symLinkDir  ## tagGraphSectionMap['d']
    tg_model = tagGraphSectionMap['model']
    tg_output = tagGraphSectionMap['output']
    tg_unimoddict = tagGraphSectionMap['unimoddict']
    tg_fmindex = tagGraphSectionMap['fmindex']
    tg_peaks = '{\'' + tagGraphSectionMap[
        'ExperimentName'] + '\': \'' + tagGraphSectionMap[
            'de_novo'] + '\'}'  # K = "{'e009133': '/lab/samba/shared/Users/Sam/20160630_Pulldown_dcas9_in_gel_digest_test_DENOVO_5/de_novo_peptides.csv'}"
    ### tg_output directory will now end with a slash
    if not tg_output.endswith('/'):
        tg_output += '/'
    tgArgs = []
    tgArgs.extend(['-p', '\"' + tg_ppmstd + '\"'])
    tgArgs.extend(['-l', '\"' + tg_modtolerance + '\"'])
    tgArgs.extend(['-M', '\"' + tg_maxcounts + '\"'])
    tgArgs.extend(['-C', '\"' + tg_modmaxcounts + '\"'])
    tgArgs.extend(['-c', '\"' + tg_config + '\"'])
    tgArgs.extend(['-i', '\"' + tg_init + '\"'])
    tgArgs.extend(['-d', '\"' + tg_dtadir + '\"'])
    tgArgs.extend(['-m', '\"' + tg_model + '\"'])
    tgArgs.extend(['-o', '\"' + tg_output + '\"'])
    tgArgs.extend(['-Q', '\"' + tg_unimoddict + '\"'])
    tgArgs.extend(['-f', '\"' + tg_fmindex + '\"'])
    tgArgs.extend(['-K', '\"' + tg_peaks + '\"'])
    write2FileStdout(fh, '\nTG ARGS: %s\n\n' % tgArgs)
    write2FileStdout(fh, MSGBORDER)
    write2FileStdout(
        fh, '*** CALLING RUN_TAGGRAPH_HUMAN_PROTEOME_EASY.py from runTG.py')
    write2FileStdout(fh, MSGBORDER + "\n")
    DataFile.executeProcess(SCRIPTS_DIR, 'RUN_TAGGRAPH_HUMAN_PROTEOME_EASY.py',
                            tgArgs)
    write2FileStdout(fh, "\n" + MSGBORDER)
    write2FileStdout(
        fh,
        '*** END CALLING RUN_TAGGRAPH_HUMAN_PROTEOME_EASY.py from runTG.py')
    write2FileStdout(fh, MSGBORDER)
    ### VERIFY TG RUN ###
    '''
    Now lets check the TG output to make sure it ran correctly. We'll check for:
    * <output_dir>/results.db should exist and have size > 0 (do actual db check?)
    * The files <output_dir>/<experiment_name>_addPlausibleMods_poss_[combo/single]_mods.tdv both exist and have reasonable sizes
    * Check that output_dir/<experiment_name>/data/ contains directories of DTA files named <experiment_name>_f01/ etc
    * Check that output_dir/<experiment_name>/de_novo/<experiment_name>_PEAKS.csv/PEAKS_parsed.tdv/PEAKS_parsed_F1.tdv etc exist
    * Check that output_dir/<experiment_name>/taggraph/<experiment_name>_PEAKS_parsed_F1_TAGGRAPH.tdv etc exist
    * output_dir/<experiment_name>_CHECK.txt.<numFractions> contains count numbers for each fraction:
    -------------------------------
    /lab/samba/shared/Users/Sam/newtest/diet60_output
    Experiment Name diet60 ID 1
    Result Counts for 4 fractions
    F1: 399878
    F2: 395964
    F3: 346932
    F4: 270693
    -------------------------------
    '''
    write2FileStdout(fh, MSGBORDER)
    write2FileStdout(fh, '*** VERIFYING TAGGRAPH OUTPUTS in runTG.py ')
    write2FileStdout(fh, MSGBORDER)
    minDBFileSize = 1000000  ## 1Megabyte minimum db size after TG runs?
    minAddPlausibleModsFileSize = 2000  ## 2 kB minimum size for <experiment_name>_addPlausibleMods_poss_[combo/single]_mods.tdv files
    ## <output_dir>/results.db should exist and have size > 0 (do actual db check?)
    dbFile = tg_output + 'results.db'
    if not os.path.exists(dbFile):
        fatalError = True
        write2FileStdout(
            fh,
            '** FAILURE ** Required SQLITE DB File \'%s\' does not exist!!' %
            (dbFile))
    else:
        dbFileSize = os.path.getsize(dbFile)
        if dbFileSize < minDBFileSize:
            fatalError = True
            write2FileStdout(
                fh,
                '** FAILURE ** Required SQLITE DB File \'%s\' is too small: %d Bytes!!'
                % (dbFile, dbFileSize))
        else:
            write2FileStdout(
                fh,
                '* Found Required SQLITE DB File \'%s\', size %d Bytes OK' %
                (dbFile, dbFileSize))
    ## The files <output_dir>/<experiment_name>_addPlausibleMods_poss_[combo/single]_mods.tdv both exist
    singleModsFile = tg_output + experimentName + '_addPlausibleMods_poss_single_mods.tdv'
    comboModsFile = tg_output + experimentName + '_addPlausibleMods_poss_combo_mods.tdv'
    if not os.path.exists(singleModsFile):
        fatalError = True
        write2FileStdout(
            fh,
            '** FAILURE ** Required Single Mods File \'%s\' does not exist!!' %
            (singleModsFile))
    else:
        singleModsFileSize = os.path.getsize(singleModsFile)
        if singleModsFileSize < minAddPlausibleModsFileSize:
            fatalError = True
            write2FileStdout(
                fh,
                '** FAILURE ** Required Single Mods File \'%s\' is too small: %d Bytes!!'
                % (singleModsFile, singleModsFileSize))
        else:
            write2FileStdout(
                fh,
                '* Found Required Single Mods File \'%s\', size %d Bytes OK' %
                (singleModsFile, singleModsFileSize))
    if not os.path.exists(comboModsFile):
        fatalError = True
        write2FileStdout(
            fh,
            '** FAILURE ** Required Combo Mods File \'%s\' does not exist!!' %
            (comboModsFile))
    else:
        comboModsFileSize = os.path.getsize(comboModsFile)
        if comboModsFileSize < minAddPlausibleModsFileSize:
            fatalError = True
            write2FileStdout(
                fh,
                '** FAILURE ** Required Combo Mods File \'%s\' is too small: %d Bytes!!'
                % (comboModsFile, comboModsFileSize))
        else:
            write2FileStdout(
                fh,
                '* Found Required Combo Mods File \'%s\', size %d Bytes OK' %
                (comboModsFile, comboModsFileSize))
    ## Check that output_dir/<experiment_name>/data/ contains directories of DTA files named <experiment_name>_f01/ etc
    dataDir = tg_output + experimentName + '/data/'
    for currFraction in xrange(1, numFractions + 1):
        filledFractionNumber = str(currFraction).zfill(2)
        currDtaDirName = dataDir + experimentName + '_f' + filledFractionNumber
        if not os.path.exists(currDtaDirName):
            fatalError = True
            write2FileStdout(
                fh, '** FAILURE ** Missing directory of DTA files at: \'%s\'' %
                (currDtaDirName))
        elif not os.path.isdir(currDtaDirName):
            fatalError = True
            write2FileStdout(
                fh, '** FAILURE ** \'%s\' exists but is not a Directory!' %
                (currDtaDirName))
        else:
            write2FileStdout(
                fh, '* Found DTA directory: \'%s\'' % (currDtaDirName))
    ## Check that output_dir/<experiment_name>/de_novo/<experiment_name>_PEAKS.csv/PEAKS_parsed.tdv/PEAKS_parsed_F1.tdv etc exist
    deNovoDir = tg_output + experimentName + '/de_novo/'
    deNovoCSV = deNovoDir + experimentName + '_PEAKS.csv'
    peaksParsed = deNovoDir + experimentName + '_PEAKS_parsed.tdv'
    fractionsParsedBase = deNovoDir + experimentName + '_PEAKS_parsed_F'
    if not os.path.exists(deNovoCSV):
        fatalError = True
        write2FileStdout(
            fh,
            '** FAILURE ** Missing de novo CSV File \'%s\' !!' % (deNovoCSV))
    else:
        write2FileStdout(
            fh, '* Found Required de novo CSV File \'%s\'' % (deNovoCSV))
    if not os.path.exists(peaksParsed):
        fatalError = True
        write2FileStdout(
            fh, '** FAILURE ** Missing Parsed de novo File \'%s\' !!' %
            (peaksParsed))
    else:
        write2FileStdout(
            fh, '* Found Required Parsed de novo File \'%s\'' % (peaksParsed))
    for currFraction in xrange(1, numFractions + 1):
        currParsedFractionFile = fractionsParsedBase + str(
            currFraction) + '.tdv'
        if not os.path.exists(currParsedFractionFile):
            fatalError = True
            write2FileStdout(
                fh,
                '** FAILURE ** Missing Parsed de novo Fraction File \'%s\' !!'
                % (currParsedFractionFile))
        else:
            write2FileStdout(
                fh, '* Found Required Parsed de novo Fraction File \'%s\'' %
                (currParsedFractionFile))
    ## Check that output_dir/<experiment_name>/taggraph/<experiment_name>_PEAKS_parsed_F1_TAGGRAPH.tdv etc exist
    taggraphDir = tg_output + experimentName + '/taggraph/'
    taggraphParsedBase = taggraphDir + experimentName + '_PEAKS_parsed_F'
    taggraphParsedSuffix = '_TAGGRAPH.tdv'
    for currFraction in xrange(1, numFractions + 1):
        currTaggraphFractionFile = taggraphParsedBase + str(
            currFraction) + taggraphParsedSuffix
        if not os.path.exists(currTaggraphFractionFile):
            fatalError = True
            write2FileStdout(
                fh,
                '** FAILURE ** Missing Parsed TagGraph Fraction File \'%s\' !!'
                % (currTaggraphFractionFile))
        else:
            write2FileStdout(
                fh, '* Found Required Parsed TagGraph Fraction File \'%s\'' %
                (currTaggraphFractionFile))
    write2FileStdout(fh, "\n" + MSGBORDER)
    write2FileStdout(fh, '*** END VERIFYING TAGGRAPH OUTPUTS in runTG.py')
    write2FileStdout(fh, MSGBORDER)
    ### END VERIFY TG RUN ###
    ### If a fatal error was thrown, do not proceed ###
    if fatalError == True:
        write2FileStdout(
            fh,
            '*****  HALTING DUE TO FATAL ERROR IN VERIFYING TAGGRAPH RUN, SEE OUTPUT ABOVE!!'
        )
        sys.exit(1)
    ## Copy configuration file to output tree for safe keeping ##
    configFileBaseName = os.path.basename(configFileName)
    checkConfigDestination = tg_output
    if os.path.exists(checkConfigDestination + configFileBaseName):
        write2FileStdout(
            fh,
            '** WARNING ** config file \'%s\' already exists in output directory \'%s\''
            % (configFileBaseName, checkConfigDestination))
    else:
        shutil.copy(configFileName, checkConfigDestination)
        write2FileStdout(
            fh,
            '* Successfully copied Configuration File \'%s\' to Output Directory \'%s\''
            % (configFileName, checkConfigDestination))
    ## Lets set up the args properly for ComputeEMProbabilitiesFromDB.py ##
    '''
    -i: same as TG -i parameter
    -F all
    -M 100
    -C 20
    -B = <-o parameter from TG>/results.db [checked after TG runs]
    -E: Same as TG ExperimentName parameter.
    -o: Output Prefix, will create files with the prefix <EM -o parameter> in the directory specified by the <TG -o parameter>
    '''
    em_init = tg_init
    em_fractions = 'all'  ## EMFractions is always 'all' now! ## = str(expectationMaximizationSectionMap['EMFractions'])
    em_maxIterations = str(expectationMaximizationSectionMap['maxIterations'])
    em_initIterations = str(
        expectationMaximizationSectionMap['initIterations'])
    em_dbLocation = tg_output + 'results.db'
    em_experimentName = tagGraphSectionMap['ExperimentName']
    em_output = tg_output
    if not em_output.endswith('/'):
        em_output += '/'
    em_output += emResultsPrefix
    emArgs = []
    emArgs.extend(['-i', '\"' + em_init + '\"'])
    emArgs.extend(['-F', '\"' + em_fractions + '\"'])
    emArgs.extend(['-M', '\"' + em_maxIterations + '\"'])
    emArgs.extend(['-C', '\"' + em_initIterations + '\"'])
    emArgs.extend(['-B', '\"' + em_dbLocation + '\"'])
    emArgs.extend(['-E', '\"' + em_experimentName + '\"'])
    emArgs.extend(['-o', '\"' + em_output + '\"'])
    write2FileStdout(fh, 'EM ARGS: %s\n' % emArgs)
    write2FileStdout(fh, MSGBORDER)
    write2FileStdout(
        fh, '*** CALLING ComputeEMProbabilitiesFromDB.py from runTG.py')
    write2FileStdout(fh, MSGBORDER + "\n")
    DataFile.executeProcess(SCRIPTS_DIR, 'ComputeEMProbabilitiesFromDB.py',
                            emArgs)
    write2FileStdout(
        fh, '*** command executed: python ComputeEMProbabilitiesFromDB.py %s' %
        emArgs)
    write2FileStdout(fh, "\n" + MSGBORDER)
    write2FileStdout(
        fh, '*** END CALLING ComputeEMProbabilitiesFromDB.py from runTG.py')
    write2FileStdout(fh, MSGBORDER)
    EMProbs_TOPONLY = tg_output + 'EM_Results_EMProbs_END_TOPONLY.tdv'
    if not os.path.exists(EMProbs_TOPONLY):
        fatalError = True
        write2FileStdout(
            fh, '** FAILURE ** Missing EMProbs END TOPONLY file \'%s\'.' %
            (EMProbs_TOPONLY))
        sys.exit(1)
    else:
        write2FileStdout(
            fh, '* Found EMProbs END TOPONLY file \'%s\'' % (EMProbs_TOPONLY))
    write2FileStdout(fh, "\n\n" + MSGBORDER)
    write2FileStdout(fh, '*** CALLING verify EM result tests from runTG.py')
    write2FileStdout(fh, "\ntime now: @ %s" % datetime.datetime.now())
    result = verifyEM.verifyEM(tg_output)
    write2FileStdout(fh, result)
    write2FileStdout(fh, MSGBORDER)
    write2FileStdout(fh, "\ntime now: @ %s" % datetime.datetime.now())
    write2FileStdout(fh,
                     '*** END CALLING verify EM result tests from runTG.py')
    write2FileStdout(fh, MSGBORDER)
    topResultsFile = tg_output + experimentName + '_TopResults.tdv'
    if not os.path.exists(topResultsFile):
        fatalError = True
        write2FileStdout(
            fh,
            '** FAILURE ** Missing TopResult file \'%s\'.' % (topResultsFile))
        sys.exit(1)
    else:
        write2FileStdout(fh,
                         '* Found TopResult file \'%s\'' % (topResultsFile))
    outputPerFraction = "No"
    write2FileStdout(
        fh,
        '**** start parseResultsDB process: %s' % (datetime.datetime.now()))
    FDRCutoff = 0.01
    logEMCutoff = 100
    DisplayProteinNum = 5
    if "outputPerFraction" in generalSectionMap:
        if True == theConfig.getboolean('General', 'outputPerFraction'):
            outputPerFraction = "Yes"
    if "FDRCutoff" in generalSectionMap:
        if isNumeric(generalSectionMap["FDRCutoff"]):
            write2FileStdout(
                fh, '* Found  Numeric TagGraph Parameter \'%s\'  : \'%s\'' %
                ("FDRCutoff", generalSectionMap["FDRCutoff"]))
        FDRCutoff = generalSectionMap['FDRCutoff']
    if "logEMCutoff" in generalSectionMap:
        if isNumeric(generalSectionMap["logEMCutoff"]):
            write2FileStdout(
                fh, '* Found  Numeric TagGraph Parameter \'%s\'  : \'%s\'' %
                ("logEMCutoff", generalSectionMap["logEMCutoff"]))
        logEMCutoff = generalSectionMap['logEMCutoff']
    if "DisplayProteinNum" in generalSectionMap:
        if isNumeric(generalSectionMap["DisplayProteinNum"]):
            write2FileStdout(
                fh, '* Found  Numeric TagGraph Parameter \'%s\'  : \'%s\'' %
                ("DisplayProteinNum", generalSectionMap["DisplayProteinNum"]))
        DisplayProteinNum = generalSectionMap['DisplayProteinNum']
    writeTopArgs = []
    writeTopArgs.extend(['\"' + tg_output + '\"'])
    writeTopArgs.extend(['\"' + tg_init + '\"'])
    writeTopArgs.extend(['\"' + outputPerFraction + '\"'])
    writeTopArgs.extend(['\"' + str(FDRCutoff) + '\"'])
    writeTopArgs.extend(['\"' + str(logEMCutoff) + '\"'])
    writeTopArgs.extend(['\"' + str(DisplayProteinNum) + '\"'])
    ## Now lets parse the original TG tab-delimted format ##
    write2FileStdout(fh, MSGBORDER)
    write2FileStdout(fh, '*** CALLING parseResultsDB.py from runTG.py')
    write2FileStdout(fh, MSGBORDER + "\n")
    DataFile.executeProcess(SCRIPTS_DIR, 'parseResultsDB.py', writeTopArgs)
    write2FileStdout(
        fh, '*** command executed: python parseResultsDB.py %s' % writeTopArgs)
    write2FileStdout(fh, "\n" + MSGBORDER)
    write2FileStdout(fh, '*** END CALLING parseResultsDB.py from runTG.py')
    write2FileStdout(
        fh, '**** done parseResultsDB process: %s' % (datetime.datetime.now()))
    write2FileStdout(fh, MSGBORDER)
    topResultsFinalFile = tg_output + experimentName + '_TopResults*.txt'
    foundFile = 0
    if len(glob.glob(topResultsFinalFile)) > 0:
        foundFile = 1
    if foundFile == 0:
        fatalError = True
        write2FileStdout(
            fh,
            '** FAILURE ** Missing result file \'%s\' from parseResultsDB.py process. Please check.'
            % (topResultsFinalFile))
        sys.exit(1)
    if 'generatePepXML' in generalSectionMap:
        if True == theConfig.getboolean('General', 'generatePepXML'):
            ## Now lets generate the output in PepXML format ##
            '''
            python -u /lab/scratch/taggraph_sarah/taggraphsourcecode/database/resultPepXML.py \
            tg_init-i /lab/scratch/taggraph_sarah/taggraphsourcecode/resources/TAG_GRAPH_Tryp_CysCarbam_MetOx.ini \
            tg_ppmstd-p 10 \
            tg_modtolerance-l 0.1 \
            tg_maxcounts-M 400 \
            tg_modmaxcounts-C 200 \
            tg_fmindex-f /var/www/html/TAG_GRAPH/lib/databases/20141209_UniHUMAN_cRAP_ILEq.fm \
            tg_model-m /lab/scratch/taggraph_sarah/taggraphsourcecode/resources/AllChargeDist_posOnlyDependence_20150808_HumanProt500000.pck \
            xxxx-c /lab/scratch/taggraph_sarah/taggraphsourcecode/resources/AllChargeDist_posOnlyDependence_20150808.txt \
            tg_unimoddict-Q /lab/scratch/taggraph_sarah/taggraphsourcecode/resources/unimodDict_noLabels.pck \
            tg_output-o /lab/samba/shared/Users/Sarah/taggraph/testmzml/output/ \
            tg_dtadir-d /lab/samba/shared/Users/Sarah/taggraph/testmzml \
            >& /lab/samba/shared/Users/Sarah/taggraph/testmzml/OutputpepXML.txt
            '''
            pepArgs = []
            pepArgs.extend(['\"' + tg_init + '\"'])
            pepArgs.extend(['\"' + tg_ppmstd + '\"'])
            pepArgs.extend(['\"' + tg_modtolerance + '\"'])
            pepArgs.extend(['\"' + tg_maxcounts + '\"'])
            pepArgs.extend(['\"' + tg_modmaxcounts + '\"'])
            pepArgs.extend(['\"' + tg_fmindex + '\"'
                            ])  # tagGraphSectionMap['fmindex']
            pepArgs.extend(['\"' + tg_model + '\"'
                            ])  # tagGraphSectionMap['model']
            #pepArgs.extend(['\"' + tg_config + '\"'])  # tagGraphSectionMap['config']
            pepArgs.extend(['\"' + tg_unimoddict + '\"'
                            ])  # tagGraphSectionMap['unimoddict']
            pepArgs.extend(['\"' + tg_output + '\"'
                            ])  # tagGraphSectionMap['output']
            pepArgs.extend(['\"' + tg_dtadir + '\"'])  # symLinkDir
            pepArgs.extend(['\"' + str(FDRCutoff) + '\"'])
            pepArgs.extend(['\"' + str(logEMCutoff) + '\"'])
            write2FileStdout(fh, MSGBORDER)
            write2FileStdout(
                fh, '*** CALLING generatePepXMLDBperFrac.py from runTG.py')
            write2FileStdout(fh, MSGBORDER + "\n")
            DataFile.executeProcess(SCRIPTS_DIR, 'generatePepXMLDBperFrac.py',
                                    pepArgs)
            write2FileStdout(
                fh,
                '*** command: python generatePepXMLDBperFrac.py %s' % pepArgs)
            write2FileStdout(fh, "\n" + MSGBORDER)
            write2FileStdout(
                fh, '*** END CALLING generatePepXMLDBperFrac.py from runTG.py')
            write2FileStdout(fh, MSGBORDER)
            '''
            Now lets clean up our temporary items and copied data files as configured! ###
            We need to:
            * Remove the sym-link directory in /tmp/ (symLinkDir)
            * If cleanMzDataFilesFromOutput is True, clean the dataDir (<output_dir>/<experiment_name>/data/)
            directory of mz[X]ML files and the DTA directories of the same name
            '''
    write2FileStdout(fh, MSGBORDER)
    write2FileStdout(fh, '***    CLEANING UP')
    write2FileStdout(fh, MSGBORDER)
    ### Remove the sym-link directory in /tmp/ (symLinkDir)
    shutil.rmtree(symLinkDir)
    if os.path.exists(symLinkDir):
        write2FileStdout(
            fh,
            '** FAILURE ** Failed to remove temporary symbolic link directory \'%s\''
            % (symLinkDir))
    else:
        write2FileStdout(
            fh,
            '* Successfully removed temporary symbolic link directory \'%s\'' %
            (symLinkDir))
    if 'cleanInputDataFilesFromOutput' in generalSectionMap:
        if True == theConfig.getboolean('General',
                                        'cleanInputDataFilesFromOutput'):
            shutil.rmtree(dataDir)
            #os.makedirs(dataDir)
            write2FileStdout(
                fh,
                '* Removed mz[X]ML and DTA files from data directory \'%s\' (cleanInputDataFilesFromOutput is True)'
                % (dataDir))
        else:
            write2FileStdout(
                fh,
                '* Leaving mz[X]ML and DTA files in data directory \'%s\' (cleanInputDataFilesFromOutput is False)'
                % (dataDir))
    if 'cleanIntermediateFiles' in generalSectionMap:
        denovoOutputDir = tg_output + '/' + experimentName + '/de_novo/'
        taggraphOutputDir = tg_output + '/' + experimentName + '/taggraph/'
        experimentOutputDir = tg_output + '/' + experimentName
        if True == theConfig.getboolean('General', 'cleanIntermediateFiles'):
            shutil.rmtree(denovoOutputDir)
            shutil.rmtree(taggraphOutputDir)
            if os.path.exists(dataDir):
                shutil.rmtree(dataDir)
            shutil.rmtree(experimentOutputDir)
            files = os.listdir(tg_output)
            for file in files:
                if (file.endswith(".tdv") or (file.find("_CHECK.txt.") > 0)
                        or file.endswith(".db") or file.endswith(".log")):
                    if (os.path.exists(os.path.join(tg_output, file))):
                        write2FileStdout(
                            fh, "remove %s" % os.path.join(tg_output, file))
                        os.remove(os.path.join(tg_output, file))
                    else:
                        write2FileStdout(
                            fh, "keeper %s" % os.path.join(tg_output, file))
            write2FileStdout(
                fh,
                '* Removed mz[X]ML and Intermediate files from output directory \'%s\' (cleanIntermediateFiles is True)'
                % (dataDir))
        else:
            write2FileStdout(
                fh,
                '* Leaving mz[X]ML and Intermediate files in output directory \'%s\' (cleanIntermediateFiles is False)'
                % (dataDir))
    write2FileStdout(fh, MSGBORDER)
    write2FileStdout(fh, '***  END CLEANING UP')
    write2FileStdout(fh, MSGBORDER)
    write2FileStdout(fh, '%s' % TAGGRAPH_CONFIG_FOOTER)
    write2FileStdout(
        fh, '**** end TagGraph process: %s' % (datetime.datetime.now()))
    fh.close()
    #move file back to output folder:
    toDest = os.path.join(tg_output, "runReport.log")
    shutil.move(runCapture, toDest)
    sys.exit(0)
Example #44
0
from DataFile import *

file = DataFile("testCsv.csv")
print(str(file.getRowCout()))
# file.appendEntry("g;Lukas;Bern")
print(str(file.getRowCout()))
file.removeEntryByColumnAndIdentifier(0,'g')
def getSpectrumAndPSMFeatureDict(LADSSeqInfo, seqEntry, scanFDict, pairConfig, PNet):

    featureList = []
    lightScans = seqEntry[0]
    heavyScans = seqEntry[1]
    
    lightSpecs = [DataFile.getMassIntPairs(scanFDict[int(lightScanF)]['dta']) for lightScanF in lightScans]
    heavySpecs = [DataFile.getMassIntPairs(scanFDict[int(heavyScanF)]['dta']) for heavyScanF in heavyScans]
    avgLightPrecMass = np.average(np.array([scanFDict[lightScanF]['precMass'] for lightScanF in lightScans]))
    
    epSTD = options.ppmstd * 10**-6 * avgLightPrecMass
    
    specs = []
    for i, massIntPairs in enumerate(lightSpecs):
        specs += [PN.Spectrum(PNet, scanFDict[lightScans[i]]['precMass'], Nmod=0.0, Cmod=0.0, epsilon=2*epSTD, spectrum=massIntPairs)]
    for i, massIntPairs in enumerate(heavySpecs):
        specs += [PN.Spectrum(PNet, scanFDict[heavyScans[i]]['precMass'], Nmod=pairConfig['NMod'], Cmod=pairConfig['CMod'], epsilon=2*epSTD, spectrum=massIntPairs)]
    for spec in specs:
        spec.initializeNoiseModel()
                                                                                                                                                    
    clusterPairingStats = Discriminator.getClusterPairingStats(lightSpecs, heavySpecs, avgLightPrecMass, pairConfig, epSTD=epSTD)
    GLFD.addClusterPairingStatsToFeatureList(clusterPairingStats, featureList)

    scoreStats = {}
    truePMs = {}
    prmLadders = {}
    for PSM in LADSSeqInfo[seqEntry]:
        lightSeq = An.preprocessSequence(PSM[1], seqMap, ambigEdges=PSM[2])
        scoreStats[PSM[:2]] = Discriminator.getScoreStats(specs, lightSeq, ambigEdges=PSM[2])

        prmLadderWithEnds = An.getPRMLadder(lightSeq, ambigEdges=PSM[2], addEnds=True)
        truePMs[PSM[:2]] = prmLadderWithEnds[-1]
        prmLadders[PSM[:2]] = prmLadderWithEnds[1:-1]
        
    PSMList = scoreStats.keys()
    spectrumOrderedScoreStats, clusterScoreStats = GLFD.compileScoreStats(scoreStats, specs, PSMList)

    spectrumAndPSMSpecificFeatureDict = {}
        
    PSMIndexDict = dict([(PSM, i) for i, PSM in enumerate(PSMList)])
    for i, PSM in enumerate(LADSSeqInfo[seqEntry]):
        PSMSpecificFeatureList = copy.copy(featureList)

        peptLength = len(prmLadders[PSM[:2]]) + 1

        # Add LADS PScore (and normalized variants)  and delta rank, delta score (LADS PScore) to feature list
        PSMSpecificFeatureList += [PSM[0], PSM[0]/peptLength, PSM[0]/len(specs), -i, PSM[0]-LADSSeqInfo[seqEntry][0][0]]
        # Add Total Path Score (and normalized variants) and delta rank, delta score (total path score)  and total minimum node score to feature list
        totalPathScore = scoreStats[PSM[:2]]['Total Path Score']
        PSMSpecificFeatureList += [totalPathScore, totalPathScore/peptLength, totalPathScore/len(specs), -clusterScoreStats['PSM Rankings'][PSMIndexDict[PSM[:2]]], totalPathScore-clusterScoreStats['Max Cluster Path Score'], scoreStats[PSM[:2]]['Total Minimum Node Score']]
        
        # Add minimum path score, maximum path score, (and normalized variants) and minimum score/maximum score for cluster to feature list
        PSMSpecificFeatureList += [scoreStats[PSM[:2]]['Minimum Path Score'], scoreStats[PSM[:2]]['Minimum Path Score']/peptLength, scoreStats[PSM[:2]]['Maximum Path Score'], scoreStats[PSM[:2]]['Maximum Path Score']/peptLength, scoreStats[PSM[:2]]['Minimum Path Score']/scoreStats[PSM[:2]]['Maximum Path Score']]
        
        # Add difference between minimum and maximum ranking for PSM across cluster to feature list
        rankingsForPSM = [spectrumOrderedScoreStats[i]['PSM Rankings'][PSMIndexDict[PSM[:2]]] for i in spectrumOrderedScoreStats]
        PSMSpecificFeatureList += [min(rankingsForPSM) - max(rankingsForPSM)]
        
        #Add Number forbidden node pairs (and normalized variants) to feature list
        numForbiddenPairs = Discriminator.getNumForbiddenPairs(prmLadders[PSM[:2]], avgLightPrecMass)
        PSMSpecificFeatureList += [numForbiddenPairs, 2.0*numForbiddenPairs/(peptLength-1)]

        # Add number of ambiguous edges to feature list
        PSMSpecificFeatureList += [len(PSM[2])]
        
        # Add stats for PRM Evidence over cluster (and normalized variants) to feature list
        PSMSpecificFeatureList += [scoreStats[PSM[:2]]['Aggregate PRM Score Statistics']['All Evidence'], scoreStats[PSM[:2]]['Aggregate PRM Score Statistics']['All Evidence']/float(peptLength-1), scoreStats[PSM[:2]]['Aggregate PRM Score Statistics']['Majority Evidence'], scoreStats[PSM[:2]]['Aggregate PRM Score Statistics']['Majority Evidence']/float(peptLength-1), scoreStats[PSM[:2]]['Aggregate PRM Score Statistics']['None Evidence'], scoreStats[PSM[:2]]['Aggregate PRM Score Statistics']['None Evidence']/float(peptLength-1)]

        # Add stats for paired PRMs and their corresponding ion types to feature list
        pairedPRMStats = Discriminator.getPairedPRMStats(prmLadders[PSM[:2]], clusterPairingStats['Light Merged Spec'], clusterPairingStats['Heavy Merged Spec'], lightSpecs, heavySpecs, clusterPairingStats['Cluster Paired PRM Information'], epSTD=epSTD)
        GLFD.addPairedPRMStatsToFeatureList(pairedPRMStats, PSMSpecificFeatureList, len(prmLadders[PSM[:2]]))

        pairedPRMLadder = pairedPRMStats['Paired PRM Ladder']        
    
        for i, scan in enumerate(lightScans):
            spectrumSpecificFeatureList = copy.copy(PSMSpecificFeatureList)
            # Add path score (and normalized variants), delta rank, delta score, number of negative PRMs, and minimum node score for spectrum to feature list
            pathScore = spectrumOrderedScoreStats[i]['Path Scores'][PSMIndexDict[PSM[:2]]]
            numNegativePRMs = spectrumOrderedScoreStats[i]['Num Negative PRMs'][PSMIndexDict[PSM[:2]]]
            spectrumSpecificFeatureList += [pathScore, pathScore/peptLength, pathScore/scoreStats[PSM[:2]]['Maximum Path Score'], -spectrumOrderedScoreStats[i]['PSM Rankings'][PSMIndexDict[PSM[:2]]], spectrumOrderedScoreStats[i]['Delta Scores'][PSMIndexDict[PSM[:2]]], numNegativePRMs, numNegativePRMs/float(peptLength-1), spectrumOrderedScoreStats[i]['Min Node Scores'][PSMIndexDict[PSM[:2]]]]
            
            # Add mass deviation from true peptide mass to feature list
            precMass = scanFDict[scan]['precMass']
            spectrumSpecificFeatureList += [abs(truePMs[PSM[:2]] + Constants.mods['H2O'] + Constants.mods['H+'] - precMass)]
        
            peakAnnotationMassOffsetStats = Discriminator.getPeakAnnotationAndMassOffsetStats(DataFile.getMassIntPairs(scanFDict[scan]['dta']), specs[i], prmLadders[PSM[:2]], pairedPRMLadder, PNet)
            GLFD.addPeakAnnotationStatsToFeatureList(PNet, peakAnnotationMassOffsetStats, spectrumSpecificFeatureList, peptLength)
            GLFD.addMassOffsetStatsToFeatureList(peakAnnotationMassOffsetStats, spectrumSpecificFeatureList)
        
            spectrumSpecificFeatureList += [precMass, GLFD.getChargeStateFromDTAFName(scanFDict[scan]['dta']), peptLength]
            spectrumAndPSMSpecificFeatureDict[(scan, PSM[:2])] = spectrumSpecificFeatureList

        for j, scan in enumerate(heavyScans):
            i = j + len(lightScans)
            
            spectrumSpecificFeatureList = copy.copy(PSMSpecificFeatureList)
            # Add path score (and normalized variants), delta rank, delta score, number of negative PRMs, and minimum node score for spectrum to feature list
            pathScore = spectrumOrderedScoreStats[i]['Path Scores'][PSMIndexDict[PSM[:2]]]
            numNegativePRMs = spectrumOrderedScoreStats[i]['Num Negative PRMs'][PSMIndexDict[PSM[:2]]]
            spectrumSpecificFeatureList += [pathScore, pathScore/peptLength, pathScore/scoreStats[PSM[:2]]['Maximum Path Score'], -spectrumOrderedScoreStats[i]['PSM Rankings'][PSMIndexDict[PSM[:2]]], spectrumOrderedScoreStats[i]['Delta Scores'][PSMIndexDict[PSM[:2]]], numNegativePRMs, numNegativePRMs/float(peptLength-1), spectrumOrderedScoreStats[i]['Min Node Scores'][PSMIndexDict[PSM[:2]]]]
            
            # Add mass deviation from true peptide mass to feature list
            precMass = scanFDict[scan]['precMass']
            spectrumSpecificFeatureList += [abs(truePMs[PSM[:2]] + pairConfig['NMod'] + pairConfig['CMod'] + Constants.mods['H2O'] + Constants.mods['H+'] - precMass)]
            
            peakAnnotationMassOffsetStats = Discriminator.getPeakAnnotationAndMassOffsetStats(DataFile.getMassIntPairs(scanFDict[scan]['dta']), specs[i], prmLadders[PSM[:2]], pairedPRMLadder, PNet)
            GLFD.addPeakAnnotationStatsToFeatureList(PNet, peakAnnotationMassOffsetStats, spectrumSpecificFeatureList, peptLength)
            GLFD.addMassOffsetStatsToFeatureList(peakAnnotationMassOffsetStats, spectrumSpecificFeatureList)
            
            spectrumSpecificFeatureList += [precMass, GLFD.getChargeStateFromDTAFName(scanFDict[scan]['dta']), peptLength]
            spectrumAndPSMSpecificFeatureDict[(scan, PSM[:2])] = spectrumSpecificFeatureList

    return spectrumAndPSMSpecificFeatureDict
Example #46
0
    return np.unique(scanFs)


if __name__ == '__main__':
    options = ArgLib.parse([
        'init', 'lads', 'sequest', 'mascot', 'pepnovo', 'output', 'database',
        'symbolmap', 'pnovo', 'peaks', 'combined'
    ])

    paramsDict = ArgLib.parseInitFile(options.init, options)
    progDict = ArgLib.getProgDict(DataFile.searchprogs, options)

    with open(options.symbolmap, 'r') as fin:
        symbolMap = pickle.load(fin)

    seqMap = DataFile.generateSeqMap(progDict, symbolMap, paramsDict)

    dbDict = DataFile.getDBInfo(options.database)
    processedInfo = {}
    if options.lads:
        LADSdict = eval(options.lads)
        for tdvfile in LADSdict.keys():
            LADSScanInfo = DataFile.getScanInfo(tdvfile,
                                                dbDict['LADS']['fields'],
                                                delimiter='\t')
            processedInfo[LADSdict[tdvfile]] = DataFile.preprocessLADSScanInfo(
                LADSScanInfo, seqMap[LADSdict[tdvfile]],
                paramsDict['Pair Configurations'], dbDict['LADS']['fieldmap'])

    if options.pepnovo:
        pepNovoDict = eval(options.pepnovo)
def addSingleModCandidates(scanF,
                           scan_items,
                           mod_lists,
                           mod_ranges_list,
                           mod_tuples_list,
                           enumerated_mods,
                           scanFDict,
                           expand_mods,
                           hashed_single_mods,
                           prob_network,
                           ep_step=0.01,
                           mod_tolerance=0.1,
                           ppmSTD=10,
                           aa_spread=3):
    #print enumerated_mods
    #for i in range(len(scan_items)):
    #    print scan_items[i]
    #    print mod_lists[i]
    #    print mod_tuples_list[i]
    #    print '-----------------'

    candidates_map = {}
    mod_range_map = {}
    # print enumerated_mods
    for i, item in enumerate(scan_items):
        # print 'Scan', scan_items[i]
        for j, mod in enumerate(mod_lists[i]):

            if mod[0][0] == 'Isobaric Substitution':
                continue

            context = item[1]
            seq_without_ends = context[2:-2]
            mod_range = mod_ranges_list[i][j]

            if (context, mod_range, mod[0][0]) in candidates_map:
                continue

            if j > 0:
                start_cut = max(
                    [mod_ranges_list[i][j - 1][1], mod_range[0] - aa_spread])
            else:
                start_cut = max([0, mod_range[0] - aa_spread])

            try:
                end_cut = min(
                    [mod_range[1] + aa_spread, mod_ranges_list[i][j + 1][0]])
            except IndexError:
                end_cut = min(
                    [mod_range[1] + aa_spread,
                     len(seq_without_ends)])

            repl_mods = []
            replace_seqs = []
            mod_class = Validator.getModClass([mod_tuples_list[i][j]])
            #has_defined_mod = any([ Validator.getModClass([mod_tuple]) == 1 for mod_tuple in enumerated_mods[(context, mod_range)] ])
            # Don't expand if mod is an AASub and there already exists a defined mod for the same mod_range/context
            #if mod_class > 1 and has_defined_mod:
            #    continue

            # print 'Mod', j, mod, start_cut, end_cut
            for repl_mod, error in hashed_single_mods[hashMass(
                    mod[0][1], ep_step)]:
                # Don't enumerate mod if already enumerated or if mod error exceeds mod_tolerance
                repl_mod_error = (0 if not mod[0][2] else mod[0][2]) + error
                # print repl_mod, repl_mod_error, repl_mod[0],  mod_tuples_list[i][j], mod_tuples_list[i][j][0], expand_mods[repl_mod], expand_mods[mod_tuples_list[i][j]]
                # print enumerated_mods[(context, mod_range)], context, mod_range
                if (repl_mod[0] == 'undefined mass shift'
                        or repl_mod in enumerated_mods[(context, mod_range)]
                        or abs(repl_mod_error) > mod_tolerance):
                    # Don't add candidate if candidate already exists for scan item or mod_error of replacement exceeds tolerance or if it is an undefined mod
                    continue
                elif mod_class == 1 and not (
                        repl_mod[0] == mod_tuples_list[i][j][0]
                        and expand_mods[repl_mod] >
                        expand_mods[mod_tuples_list[i][j]]):
                    # If mod is a defined mod, only expand if the replacement mod has the same name but a different and more prevalent localization
                    continue

                # print 'replace candidate', mod, repl_mod, error, repl_mod_error, context, seq_without_ends[start_cut:end_cut]
                repl_mods += [(repl_mod, repl_mod_error)]

            term = getTerminus(start_cut, end_cut, seq_without_ends)
            subseq = seq_without_ends[start_cut:end_cut]
            for repl_mod, error in repl_mods:
                # add candidates
                try:
                    locs = getModLocs(subseq, term, repl_mod)
                    for loc in locs[1]:
                        replace_seqs += getSingleModSeq(
                            subseq, repl_mod, loc, error)
                except IndexError:
                    # An IndexError occurs when subseq is blank. This happens when an
                    # insertion is sandwiched exactly between two mods, or between a mod
                    # and a terminus. It doesn't happen normally, but can occur in special
                    # cases, e.g. when the first amino acid in the context is an oxidized
                    # methionine and the Insertion is N-terminal to it.
                    # TODO: See if this case happens frequently enough with carbamidomethyls to mess up the results
                    pass

            if len(replace_seqs) > 0:
                candidates_map[(context, mod_range, mod[0][0])] = replace_seqs
                mod_range_map[(context, mod_range)] = (start_cut, end_cut)

    # print candidates_map
    new_scan_items = []
    if len(candidates_map) > 0 and scanF in scanFDict:
        precMass = scanFDict[scanF]['precMass']
        epSTD = ppmSTD * precMass * 10**-6
        spec = PN.Spectrum(prob_network,
                           precMass,
                           Nmod=0.0,
                           Cmod=0.0,
                           epsilon=2 * epSTD,
                           spectrum=DataFile.getMassIntPairs(
                               scanFDict[scanF]['dta']),
                           useMemo=True)
        spec.initializeNoiseModel()

        # Generate new peptide candidates
        for i, item in enumerate(scan_items):
            for j, mod_range in enumerate(mod_ranges_list[i]):
                # add candidate

                if (item[1], mod_range,
                        mod_lists[i][j][0][0]) in candidates_map:
                    # print '----------------'
                    # print item[1]
                    # print mod_range
                    # print mod_lists[i][j]
                    # print mod_ranges_list
                    # for candidate in candidates_map[(item[1], mod_range, mod_lists[i][j][0][0])]:
                    #     print candidate

                    new_scan_items += [
                        getSingleModCandidate(
                            spec, scanFDict[scanF]['charge'], item,
                            mod_lists[i], mod_ranges_list[i], j,
                            mod_range_map[(item[1], mod_range)], candidate[0],
                            candidate[1], candidate[2], candidate[3])
                        for candidate in candidates_map[(
                            item[1], mod_range, mod_lists[i][j][0][0])]
                    ]

    #print 'Num Single Mods Before getUniqueCandidates', len(new_scan_items)
    return getUniqueCandidates(new_scan_items)
Example #48
0
    dirPath = 'C:\\Users\\Arun\\Pythonprojects\\DeNovoSequencing\\LF2_short_HCD+CID_ath001862_244\\'
    dtaNames = DataFile.getDTAFNamesInDir(dirPath)

    scansIter = iter(dtaNames)
    currScanInfo = scansIter.next()
    for dta in dtaNames:
        precMass = DataFile.getPrecMassAndCharge(dta)[0]
        spectra = DataFile.getMassIntPairs(dta)
        S = Spectrum(PN, precMass, 0.0, 0.0, spectra)
        corr = S.correctParentMass()
        if np.abs(corr) > 0.04:
            print dta, corr

    """
    paramsDict = DataFile.parseParams(
        '/home/arun/Documents/LADS_SILAC_Trypsin.ini')
    print getPRMLadder('A', 0)
    """
    heavyPath = "C:\\Users\\Arun\\DropBox\\SpectraCorrelation\\244.3367.3367.1.dta"
    lightPath = "C:\\Users\\Arun\\DropBox\\SpectraCorrelation\\244.3383.3383.1.dta"
    heavyPairs = DataFile.getMassIntPairs(heavyPath)
    lightPairs = DataFile.getMassIntPairs(lightPath)
    heavyPrecMass, heavyCharge = DataFile.getPrecMassAndCharge(heavyPath)
    lightPrecMass, lightCharge = DataFile.getPrecMassAndCharge(lightPath)

    heavySpec = Spectrum(PN, heavyPrecMass, 0, mods['*'], heavyPairs)
    lightSpec = Spectrum(PN, lightPrecMass, 0, 0, lightPairs)
    heavySpec.initializeNoiseModel()
    lightSpec.initializeNoiseModel()
    print heavySpec.noiseModel
    print lightSpec.noiseModel
Example #49
0
import requests
import time
import os, sys
from bs4 import BeautifulSoup
import DataFile
import Mail
import Template
from ast import literal_eval
from itertools import chain

url_prefix = "http://tj01.tupu.hb.ted:28026"

get_word_loc = "http://10.143.54.80:81/vr_query_period/vr_query_garbled_tupu.txt"
word_file = "./word_tupurec"
word_list = DataFile.read_file_into_list("./word_tupurec")
result_file = './tupurec_garbled_result'
report_tmp_path = "mail_detail.html"
mail_to = "*****@*****.**"

f_res = open(result_file, 'w', encoding='utf8')


def log_info(str):
    time_str = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    sys.stdout.write('[%s] [info] %s\n' % (time_str, str))
    sys.stdout.flush()


def utf8stdout(in_str):
    utf8stdout = open(1, 'w', encoding='utf-8',
def getSharedPeaksRatio(lightPath, heavyPath, epsilon):
    lightPairs = DataFile.getMassIntPairs(lightPath)
    heavyPairs = DataFile.getMassIntPairs(heavyPath)
    N, C = SA.getNandCIons(lightPairs, heavyPairs, pairConfig['NMod'], pairConfig['CMod'], epsilon=epsilon)
    return SA.getSharedPeaksRatio(lightPairs, heavyPairs, N, C)
        p.join()

    for l in L:
        for j in l:
            outFile.write(str(j) + '\t')
        outFile.write('\n')

if __name__ == '__main__' :
    options = ArgLib.parse(['init', 'dtadir', 'config', 'model', 'output', 'columns', 'verbose', 'paircutoff', 'ppmsyserror', 'ppmstd', 'ppmpenalty', 'ambigpenalty', 'minedge', 'maxedge', 'alpha', 'subgraphcut', 'symbolmap'])
    epStep = 0.00025
    maxEp = 0.1
    
    paramsDict = ArgLib.parseInitFile(options.init, options)
    with open(options.symbolmap, 'r') as fin:
        symbolMap = pickle.load(fin)
    seqMap = DataFile.generateSeqMap({'LADS Unit Test': 'LADS'}, symbolMap, paramsDict)
    
    if options.columns:
        with open(options.columns) as fin:
            cols = pickle.load(fin)
    else:
        print 'Using default cols'
        cols = ['light scan', 'heavy scan', 'pair configuration', 'M+H', 'score', 'seq', 'epsilon', 'ambiguous edges', 'num ambig edges']
    
    if options.output:
        outFile = open(options.output, 'w')
        outFile.write('\t'.join([col.upper() for col in cols]) + '\n')
    
    PNet = PN.ProbNetwork(options.config, options.model)

    dtaList = glob.glob(options.dtadir + '/*.dta')
Example #52
0
    dirPath = 'C:\\Users\\Arun\\Pythonprojects\\DeNovoSequencing\\LF2_short_HCD+CID_ath001862_244\\'
    dtaNames = DataFile.getDTAFNamesInDir(dirPath)
    
    scansIter = iter(dtaNames)
    currScanInfo = scansIter.next()
    for dta in dtaNames:
        precMass = DataFile.getPrecMassAndCharge(dta)[0]
        spectra = DataFile.getMassIntPairs(dta)
        S = Spectrum(PN, precMass, 0.0, 0.0, spectra)
        corr = S.correctParentMass()
        if np.abs(corr) > 0.04:
            print dta, corr
        
    """
    paramsDict = DataFile.parseParams('/home/arun/Documents/LADS_SILAC_Trypsin.ini')
    print getPRMLadder('A', 0)
    """
    heavyPath = "C:\\Users\\Arun\\DropBox\\SpectraCorrelation\\244.3367.3367.1.dta"
    lightPath = "C:\\Users\\Arun\\DropBox\\SpectraCorrelation\\244.3383.3383.1.dta"
    heavyPairs = DataFile.getMassIntPairs(heavyPath)
    lightPairs = DataFile.getMassIntPairs(lightPath)
    heavyPrecMass, heavyCharge = DataFile.getPrecMassAndCharge(heavyPath) 
    lightPrecMass, lightCharge = DataFile.getPrecMassAndCharge(lightPath)  
    
    heavySpec = Spectrum(PN, heavyPrecMass, 0, mods['*'], heavyPairs)
    lightSpec = Spectrum(PN, lightPrecMass, 0, 0, lightPairs)
    heavySpec.initializeNoiseModel()
    lightSpec.initializeNoiseModel()
    print heavySpec.noiseModel
    print lightSpec.noiseModel
    for i, feature in enumerate(featureNames):
        print '%i. %s: %f' % (i+1, feature, featureList[i])

if __name__ == '__main__':
    print 'This program generates LETOR format training data for the training of a discriminator. dtadir is of the format {/loc of dtadir: (loc of LADS SequenceDTAsTDV.py LOG file, loc of combined SEQUEST-MASCOT database results)}'
    options = ArgLib.parse(['init', 'dtadir', 'ppmstd', 'symbolmap', 'output', 'model', 'config'])

    paramsDict = ArgLib.parseInitFile(options.init, options)
    pairConfigurations = paramsDict['Pair Configurations']
    ppm = float(options.ppmstd)

    dtadirInfo = eval(options.dtadir)
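    # Hypothetical example (illustrative paths only) of the dict literal that --dtadir
    # expects, following the format described in the banner print above:
    #   {'/data/run1_dtas': ('/data/run1_SequenceDTAsTDV.log',
    #                        '/data/run1_SEQUEST_MASCOT_combined.tdv')}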

    with open(options.symbolmap, 'r') as fin:
        symbolMap = pickle.load(fin)
    seqMap = DataFile.generateSeqMap({'LADS Unit Test': 'LADS'}, symbolMap, paramsDict)
    seqMap = seqMap['LADS Unit Test']

    PNet = PN.ProbNetwork(options.config, options.model)
    outFile = open(options.output, 'w')

    featureNames = generateFeatureNames(PNet)
    #printFeatureNames(featureNames)

    heavySeqMaps = {}
    for confName in pairConfigurations:
        heavySeqMaps[confName] = copy.deepcopy(seqMap)
        heavySeqMaps[confName]['Mods']['N-Term'] = pairConfigurations[confName]['NModSymbol']
        heavySeqMaps[confName]['Mods']['C-Term'] = pairConfigurations[confName]['CModSymbol']

Example #54
0
if __name__ == '__main__':
    print 'dtadir is the directory containing the mzXML files to analyze'
    print 'peaks is a dictionary mapping {experiment_name: peaks csv}'
    print 'output is the directory to move all files to and set up the project in'
    options = ArgLib.parse(['init', 'dtadir', 'peaks', 'output'])

    print 'options.output: %s' % (options.output)
    print 'normpath(options.output): %s' % (os.path.normpath(options.output))
    # Fails with an OSError if directory already exists
    os.makedirs(options.output)

    # Create database
    args = ['--sqlite', os.path.join(options.output, 'results.db')]
    print 'Models.py dir: %s' % (DATABASE_SCRIPT_DIR)
    DataFile.executeProcess(DATABASE_SCRIPT_DIR, 'Models.py', args)

    # Make experiment directories
    # Structure
    # /options.output
    # .../ExperimentName
    # ...../data
    # ...../de_novo
    # ...../taggraph
    for experiment, peaks_file in eval(options.peaks).items():
        experiment_dir = os.path.join(options.output, experiment)
        os.makedirs(experiment_dir)

        # Make the de_novo subdirectory
        peaks_dir = os.path.join(experiment_dir, 'de_novo')
        os.makedirs(peaks_dir)
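        # Sketch (assumption, not part of the excerpt above): the remaining 'data' and
        # 'taggraph' subdirectories named in the Structure comment would be created the
        # same way as 'de_novo'.
        os.makedirs(os.path.join(experiment_dir, 'data'))
        os.makedirs(os.path.join(experiment_dir, 'taggraph'))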
Example #55
0
    #    print key, MASCOTInfo[key]
    """
    seq = '-K*M#--'
    ambigEdges=[(0,1000),(0,2000),(0,4000)]
    paramsDict = DataFile.parseParams('./Misc/LADS_SILAC_Trypsin.ini')
    seqMap = DataFile.generateSeqMap(['SEQUEST', 'MASCOT', 'LADS'], paramsDict)
    Constants.NTermMods['['] = 10000
    Constants.CTermMods[']'] = 20000
    Constants.NTermMods['^'] = 40000
    seqMap['LADS']['Mods']['N-Term'] = '['
    seqMap['LADS']['Mods']['$'] = '^'
    seqMap['LADS']['Mods']['C-Term'] = ']'
    #nodeGen = Constants.nodeInfoGen(seq, considerTerminalMods=True, addTerminalNodes=True, ambigEdges=ambigEdges)
    #for node in nodeGen:
        #print node
    newSeq = preprocessSequence(seq, seqMap['LADS'], replaceExistingTerminalMods=True, ambigEdges=copy.copy(ambigEdges))
    print newSeq
    print getPRMLadder(newSeq, ambigAA='X', addEnds=True, ambigEdges=copy.copy(ambigEdges))
    """
    """
    #print comparePeptideResults('AAKKIKK', 'KAAIKKK')
    dirPath = '/home/arun/Proteomics_Data/LF2_short_HCD+CID_ath001862_244'
    heavyPath = dirPath + '244.3383.3383.1.dta'
    lightPath = dirPath + '3760.0160.0160.3.dta'
    """
    dbInfo = DataFile.getDBInfo('./Misc/searchformatparams.pag')
    compInfo = DataFile.getScanInfo('/home/arun/Downloads/ath009552_ppm5_alpha0.90_min300_max500_PC0.05_SGCut300_SEQUEST_CompareSearches.py_UnitTest.tdv', delimiter='\t')
    writeFASTAFile(compInfo, 'LADS Unit Test', dbInfo['infoMap'], 'test.txt', ambigEdgeCutoff=1)

    
Example #56
0

if __name__ == '__main__':
    '''
    Maxkeys = int(input('Enter max number of keys : '))
    t = HardCodeTree(Maxkeys)
    t.PrintIndexFile()
    ans = t.SearchKeyRec(88)
    if ans:
        print(ans[0],ans[1])
    else: print('NONE')
    '''

    NumberOfRecords = int(input('Enter number of records : '))
    Maxkeys = int(input('Enter max number of keys : '))
    DataFile.createDataFile(NumberOfRecords)

    t = CreateBPlusTree('dataFile.bin', Maxkeys)
    t.PrintIndexFile()

    while True:
        try:
            key = int(input('\nEnter key to be searched : '))
            n = int(input('\nEnter number of keys to be fetched :  '))
        except ValueError:
            print('The value of key must be a positive number !!')
            continue

        keytup = t.SearchKeyRec(key)
        if keytup:
            keytup = keytup[1]
            print('FOUND !!! The given key exists as -> ', keytup)
    default='',
    help=
    'Prefix for FMIndex output files. Base of Fasta input file will be used if not supplied.'
)
parser.add_argument('fasta', help='Input FASTA filename. Must end with .fasta')
args = parser.parse_args()

if not os.path.basename(args.fasta.lower()).endswith('.fasta'):
    raise FileNotFoundError(
        "Error! FASTA input {} doesn't end with .fasta!".format(args.fasta))

if args.output == '':
    output_filename = os.path.basename(args.fasta[:-6])
    print(
        'BLANK OUTPUT basename - using the FASTA input file  base: {}'.format(
            output_filename))
    args.output = output_filename

Database.makeDBForFMIndexFromFASTA(args.fasta, args.output)

fmbuild_loc = os.path.abspath(os.path.join(PAR_DIR, 'lib', 'fmindex'))
for fm_formatted in glob.glob(args.output + '*fmFormatted*'):
    DataFile.executeProcess(fmbuild_loc,
                            'fmbuild', [
                                '-v', fm_formatted, args.output +
                                '.fm%s' % os.path.splitext(fm_formatted)[1]
                            ],
                            interpreter=False)
    os.remove(fm_formatted)
    paramsDict = ArgLib.parseInitFile(options.init, options)

    infoDict = eval(options.mainprogname)

    with open(options.unimoddict) as fin:
        unimodDict = pickle.load(fin)
    hashedUnimodDict = hashUnimodDict(unimodDict)

    outFile = open(options.output, 'w')
    cols = ['ScanF', 'Score', 'Peptide', 'Unmod Peptide', 'References', 'Modifications', 'DB Peptide', 'Alignment Score']
    if 'Ambig Edges' in infoDict:
        cols.insert(2, 'Ambig Edges')
        
    outFile.write('\t'.join([col for col in cols]) + '\n')

    for entry in DataFile.getScanInfo(options.comp, delimiter='\t'):
        scanData = {}
        scanData['ScanF'] = entry[infoDict['ScanF']]
        scanData['Peptide'] = entry[infoDict['Peptide']]
        scanData['Unmod Peptide'] = An.stripModifications(scanData['Peptide'], noRemove=[])
        scanData['Score'] = entry[infoDict['Score']]
        scanData['Alignment Score'] = None
        
        if 'Ambig Edges' in infoDict:
            ambigEdges = eval(entry[infoDict['Ambig Edges']])
            scanData['Ambig Edges'] = ambigEdges
        else:
            ambigEdges = []
        deNovoPRMLadder = An.getPRMLadder(scanData['Peptide'], ambigEdges=ambigEdges)
                
        refList = eval(entry[infoDict['References']])
def getSharedPeaksRatio(lightPath, heavyPath, epsilon):
    lightPairs = DataFile.getMassIntPairs(lightPath)
    heavyPairs = DataFile.getMassIntPairs(heavyPath)
    N, C = SA.getNandCIons(lightPairs, heavyPairs, 17.0265, -16.0187, epsilon=epsilon)
    return SA.getSharedPeaksRatio(lightPairs, heavyPairs, N, C)
Example #60
0
    # Load positions of sequence separators
    protein_names = []
    for seqnames_file in sorted(glob.glob(index_basename + '.seqnames*')):
        protein_names += [anydbm.open(seqnames_file)]

    with open(index_basename + '.offsets') as fin:
        protein_offsets = pickle.load(fin)

    # Load FM Index
    seq_indices = []
    for seqnames_file in sorted(glob.glob(index_basename + '.fm*')):
        seq_indices += [fm.FM_load(seqnames_file)]

    # Load de novo sequence info
    de_novo_cols, de_novo_results = DataFile.getScanInfo(options.denovo,
                                                         delimiter='\t')
    # Initialize ambiguous edges to empty list if data is not present in de novo results (i.e., not running TAG-GRAPH on LADS)
    ambig_edges_present = 'Ambig Edges' in de_novo_cols
    ambig_edges = []

    # Prep output file
    outFile = open(options.output, 'w')
    cols = [
        'ScanF', 'Alignment Score', 'Rank', 'Context', 'Modifications',
        'Proteins', 'Matching Tag Length', 'De Novo Peptide',
        'Unmod De Novo Peptide', 'De Novo Score', 'Time Taken'
    ]
    if ambig_edges_present:
        cols.insert(-3, 'Ambig Edges')
    outFile.write('\t'.join(cols) + '\n')