def findDeltaPairs(dtaList, delta, ppm=5, intEp=20):
    precMassArr = np.zeros((len(dtaList), 2))
    pairs = []
    for i in range(len(dtaList)):
        precMassArr[i] = [DataFile.getPrecMassAndCharge(dtaList[i])[0], DataFile.getScanNum(dtaList[i])]

    maxPrecMass = np.max(precMassArr, 0)[0]
    epsilon = ppm * 10**-6 * maxPrecMass
    resolution = epsilon/intEp

    hPrecMassArr = copy.copy(precMassArr)
    hPrecMassArr[:,0] = np.round(hPrecMassArr[:,0]/resolution)

    hashedDict = {}
    for elem in hPrecMassArr:
        hInd = int(elem[0])
        for hMass in range(hInd-intEp, hInd+intEp+1):
            try:
                hashedDict[hMass] += [(hMass-hInd, elem[1])]
            except KeyError:
                hashedDict[hMass] = [(hMass-hInd, elem[1])]

    shiftHashDict = copy.copy(precMassArr)
    shiftHashDict[:,0] = np.round((shiftHashDict[:,0] - delta)/resolution)

    for i, elem in enumerate(shiftHashDict):
        hInd = int(elem[0])
        if hInd in hashedDict:
            for possiblePair in hashedDict[hInd]:
                if abs(possiblePair[0]) * resolution * 10**6/precMassArr[i][0] < ppm:
                    pairs += [(int(possiblePair[1]), int(elem[1]))]

    return pairs
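# Usage sketch (hypothetical helper, directory and mass delta, not part of the
# original module): findDeltaPairs hashes precursor masses at roughly ppm/intEp
# resolution so that scans whose masses differ by `delta` within the ppm
# tolerance can be paired without an all-vs-all comparison. Assumes numpy, copy
# and the repo's DataFile module are importable here.
def _example_find_delta_pairs(dta_dir='example_dta_dir', label_delta=4.0):
    import glob
    dtaList = glob.glob(dta_dir + '/*.dta')
    for scanF1, scanF2 in findDeltaPairs(dtaList, label_delta, ppm=5):
        print 'putative pair of scans:', scanF1, scanF2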
def main():
    pic_dir, nginx_dir = gen_pic_dir(pic_dir_prefix)
    get_word(get_word_loc, word_file)
    index = 1
    report_content = ""
    mail_title = Template.html_h3_title("如下查询结果可能有乱码,请确认")
    mail_title += Template.html_h3_title("本次运行的截图目录为:%s" % nginx_dir)
    mail_res = ""
    for word in word_list:
        print("process %d word" % index)
        try:
            # Take a screenshot of the result block for this query/vrid pair
            tmp_list = word.split()
            query = tmp_list[0]
            vrid = tmp_list[1]
            _craw_url = url_prefix + quote(query)
            vrstr = 'div.vrResult[id*="sogou_vr_' + vrid + '"],div.vrResult[id*="sogou_vr_kmap_' + vrid + '"]'
            vrstr = quote(vrstr)
            picname = pic_dir + "/" + "_".join([str(index), quote(query), vrid + ".png"])
            nodejs_script = 'spec-selector.js'
            path = '/search/odin/yinjingjing/python/garbled_detector/'
            child = subprocess.Popen(['/bin/node', nodejs_script,
                                      '-t', 'android', '-m', 'css',
                                      '-k', vrstr, '-n', picname,
                                      '-u', quote(_craw_url)],
                                     shell=False, cwd=path, stdout=subprocess.PIPE)
            nodejs_res = child.stdout.read()
            if nodejs_res != b'0\n':
                utf8stdout("puppeteer ERROR. query:%s, vrid:%s, error:%s" % (query, vrid, nodejs_res))
                continue
            else:
                garble, res = check_garbled(query, picname)
                utf8stdout("query:%s, vrid:%s, gDetect-api result:%s, is_garble:%s" % (query, vrid, res, garble))
                if garble:
                    mail_info = "index:%d, query:%s, vrid:%s, gDetect-api result:%s" % (index, query, vrid, res)
                    mail_res += "<p>" + mail_info + "</p>\n"
            child.wait()
            index = index + 1
        except Exception as err:
            print(err)
            index = index + 1
            continue
    # Only send mail when garbled results were detected
    #if mail_res:
    #    utf8stdout("mail_res is not null, Send mail")
    report_content = mail_title + mail_res
    DataFile.write_full_file(report_tmp_path, report_content)
    Mail.sendMail("立知&图谱结果乱码检测", report_tmp_path, mail_to)
def getPairedAndUnpairedSpectra(dtaDir, dtaList, Nmod, Cmod, ppm=5, cutOff=0.1, verbose=False):
    specPairs = []
    unpairedSpecs = []
    delta = Nmod + Cmod
    for i in range(len(dtaList)):
        paired = False
        precMass1 = DataFile.getPrecMassAndCharge(dtaList[i])[0]
        spec1 = DataFile.getMassIntPairs(dtaList[i])
        for j in range(i + 1, len(dtaList)):
            precMass2 = DataFile.getPrecMassAndCharge(dtaList[j])[0]
            epsilon = ppm * 10 ** -6 * max(precMass1, precMass2)
            if np.abs(np.abs(precMass1 - precMass2) - delta) < epsilon:
                spec2 = DataFile.getMassIntPairs(dtaList[j])
                if precMass1 < precMass2:
                    N, C = SA.getNandCIons(spec1, spec2, Nmod, Cmod, epsilon=epsilon)
                    ratio = SA.getSharedPeaksRatio(spec1, spec2, N, C)
                else:
                    N, C = SA.getNandCIons(spec2, spec1, Nmod, Cmod, epsilon=epsilon)
                    ratio = SA.getSharedPeaksRatio(spec2, spec1, N, C)
                if ratio > cutOff:
                    if verbose:
                        print 'Pair found', dtaList[i], dtaList[j]
                    paired = True
                    specs = (dtaList[i], dtaList[j])
                    lightInd = int(precMass2 < precMass1)
                    specPairs.extend([(ratio, specs[lightInd], specs[1 - lightInd])])
        if not paired:
            unpairedSpecs.extend([dtaList[i]])
            if verbose:
                print 'No pairs for', dtaList[i]
    return specPairs, unpairedSpecs
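# Usage sketch (hypothetical helper, directory and modification masses, not part
# of the original module): pairs light/heavy spectra whose precursor masses differ
# by Nmod + Cmod and whose shared-peaks ratio exceeds the cutoff. Assumes numpy,
# the repo's DataFile module and the SA spectrum-alignment module are importable.
def _example_pair_spectra(dta_dir='example_dta_dir'):
    import glob
    dtaList = glob.glob(dta_dir + '/*.dta')
    pairs, unpaired = getPairedAndUnpairedSpectra(dta_dir, dtaList, Nmod=4.0, Cmod=0.0, ppm=5, cutOff=0.1)
    print len(pairs), 'pairs and', len(unpaired), 'unpaired spectra'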
def getAlignmentRatios(scanInfoFName, dtaDir, delta, epsilon=0.02):
    scanInfo = DataFile.getScanInfo(scanInfoFName)
    dtaNames = DataFile.getDTAFNamesInDir(dtaDir)
    scansToUse = scanInfo
    """
    for i in range(len(scanInfo) - 1):
        if (int(scanInfo[i][0]) + 1 == int(scanInfo[i+1][0])):
            if (scanInfo[i][1] == scanInfo[i+1][1]):
                scansToUse += [scanInfo[i]]
        else:
            scansToUse += [scanInfo[i]]
    """
    ratios = []
    goodRatios = []
    for i in range(len(scansToUse)):
        for j in range(i + 1, len(scansToUse)):
            if j == i + 1:
                print '%s percent done' % str(float(i) / len(scansToUse))
            if np.abs(np.abs(float(scansToUse[i][1]) - float(scansToUse[j][1])) - delta) < epsilon:
                dta1 = '244.%(scanF)04i.%(scanF)04i.1.dta' % {'scanF': int(scansToUse[i][0])}
                dta2 = '244.%(scanF)04i.%(scanF)04i.1.dta' % {'scanF': int(scansToUse[j][0])}
                spec1 = DataFile.getMassIntPairs(dtaDir + dta1)
                spec2 = DataFile.getMassIntPairs(dtaDir + dta2)
                ratio = SA.getSharedPeaksRatio(float(scansToUse[i][1]), spec1, float(scansToUse[j][1]), spec2, epsilon)
                print ratio, scansToUse[i], scansToUse[j]
                ratios.extend([(ratio, scansToUse[i], scansToUse[j])])
    with open('heavylightpairs.txt', 'w') as fout:
        pickle.dump(ratios, fout)
    return ratios
def findSamePrecMassClusters(dtaList, ppm=5):
    precMassArr = np.zeros((len(dtaList), 2))
    for i in range(len(dtaList)):
        precMassArr[i] = [DataFile.getPrecMassAndCharge(dtaList[i])[0], DataFile.getScanNum(dtaList[i])]

    precMassArr = precMassArr[np.argsort(precMassArr[:,0])]
    clusters = [[i] for i in range(precMassArr.shape[0])]
    # Merge adjacent clusters (in precursor-mass order) whose masses agree within epsilon
    i = 1
    while i < len(clusters):
        mergeClusters = False
        epsilon = ppm * 10**-6 * precMassArr[clusters[i][0]][0]
        for precMassInd1 in clusters[i]:
            for precMassInd2 in clusters[i - 1]:
                if (np.abs(precMassArr[precMassInd1][0] - precMassArr[precMassInd2][0]) < epsilon):
                    mergeClusters = True
                    break
        if mergeClusters:
            clusters[i - 1].extend(clusters[i])
            del clusters[i]
        else:
            i = i + 1

    scanFClusters = []
    for cluster in clusters:
        scanFClusters += [[precMassArr[i][1] for i in cluster]]

    return scanFClusters
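# Usage sketch (hypothetical helper and directory, not part of the original
# module): clusters .dta spectra whose precursor masses agree within the ppm
# tolerance and prints the scan numbers grouped together. Assumes numpy and the
# repo's DataFile module are importable here.
def _example_prec_mass_clusters(dta_dir='example_dta_dir'):
    import glob
    for cluster in findSamePrecMassClusters(glob.glob(dta_dir + '/*.dta'), ppm=5):
        print 'scans with matching precursor mass:', [int(scanF) for scanF in cluster]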
async def _action_combo_get_page_content(url, cookies_dir='data/cookies/'):
    try:
        # Work out which domain the url belongs to
        parsed_uri = urlparse(url)
        cookies_file = "".join([cookies_dir, parsed_uri.netloc, "cookie"])
        my_cookie_file = DataFile.read_file_intostr(cookies_file)
        browser = await launch({
            "executablePath": "chromium-browser",
            "args": ["--no-sandbox"]
        })
        page = await browser.newPage()
        # Load saved cookies for this domain, if any
        if (len(my_cookie_file) > 0):
            my_cookie_object = json.loads(my_cookie_file)
            print("".join(["Load ", str(len(my_cookie_object)), " cookie item(s)."]))
            for row in my_cookie_object:
                await page.setCookie(row)
        # Set the user agent
        ua_box = UserAgent.UserAgentBox()
        await page.setUserAgent(ua_box.wap_normal_user)
        await page.goto(url)
        new_cookie = await page.cookies()
        json_cookie = json.dumps(new_cookie)
        res = await action_get_page_content(page)
        DataFile.write_full_file(cookies_file, json_cookie)
        await browser.close()
        return res
    except Exception as e:
        traceback.print_exc()
        return ""
def main():
    arr_lst = ['name', 'allname', 'year']
    get_word(get_word_loc, word_file)
    index = 1
    report_content = ""
    mail_title = Template.html_h3_title("如下查询结果可能有乱码,请确认")
    mail_res = ""
    for word in word_list:
        utf8stdout("process %d word" % index)
        try:
            # Fetch the source response and extract the attribute text to check
            tmp_list = word.split()
            query = tmp_list[0]
            vrid = tmp_list[1]
            response = get_response(query, 'wap', '1')
            if not response:
                utf8stdout("source response is null. query:%s, vrid:%s" % (query, vrid))
                index = index + 1
                continue
            extract_text = get_att_name(query, response, arr_lst)
            utf8stdout("extract_text:%s" % extract_text)
            if not extract_text:
                utf8stdout("extract text is null. query:%s, vrid:%s" % (query, vrid))
                index = index + 1
                continue
            garble, res = check_garbled(query, extract_text)
            utf8stdout("query:%s, vrid:%s, gDetect-api result:%s, is_garble:%s" % (query, vrid, res, garble))
            if garble:
                f_res.write("index:%d, query:%s, vrid:%s\n" % (index, query, vrid))
                f_res.write("gDetect-api result:%s\n" % res)
                f_res.write("extract_text:%s\n" % extract_text)
                f_res.write('\n')
            index = index + 1
        except Exception as err:
            utf8stdout(err)
            index = index + 1
            continue
    f_res.close()
    # Only send mail when garbled results were detected
    if os.path.getsize(result_file) > 0:
        report_content = mail_title + mail_res
        DataFile.write_full_file(report_tmp_path, report_content)
        Mail.sendMail("图谱推荐结果乱码检测", report_tmp_path, mail_to, attachment=result_file)
def getScanFDict(dtaList):
    scanFDict = {}
    for dta in dtaList:
        scanF = DataFile.getScanNum(dta)
        precMass = DataFile.getPrecMassAndCharge(dta)[0]
        scanFDict[scanF] = {"dta": dta, "precMass": precMass, "sequenced": False}
    return scanFDict
def getScanFDict(dtaList):
    scanFDict = {}
    for dta in dtaList:
        scanF = DataFile.getScanNum(dta)
        precMass = DataFile.getPrecMassAndCharge(dta)[0]
        scanFDict[scanF] = {'dta': dta, 'precMass': precMass, 'sequenced': False}
    return scanFDict
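# Usage sketch (hypothetical helper and directory, not part of the original
# module): getScanFDict indexes a directory of .dta files by scan number so that
# later stages can look up the file path and precursor mass of a scan and mark it
# as sequenced. Assumes the repo's DataFile module is importable here.
def _example_scanf_dict(dta_dir='example_dta_dir'):
    import glob
    scanFDict = getScanFDict(glob.glob(dta_dir + '/*.dta'))
    for scanF in sorted(scanFDict):
        print scanF, scanFDict[scanF]['precMass'], scanFDict[scanF]['dta']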
def main():
    get_word(get_word_loc, word_file)
    index = 1
    report_content = ""
    mail_title = Template.html_h3_title("附件结果可能有乱码,请确认")
    for word in word_list:
        utf8stdout("process %d word" % index)
        try:
            # Fetch the result page and extract the QA text to check
            tmp_list = word.split()
            query = tmp_list[0]
            vrid = tmp_list[1]
            node = Node(query, vrid)
            node.gen_url()
            node.get_response()
            if not node.html:
                utf8stdout("source html is null. query:%s, vrid:%s" % (query, vrid))
                index = index + 1
                continue
            node.get_qa_text()
            if not node.qa_text:
                utf8stdout("qa text is null. query:%s, vrid:%s" % (query, vrid))
                index = index + 1
                continue
            node.check_garbled()
            node_res = node.output_garble()
            if node_res:
                f_res.write("index:%d, query:%s, vrid:%s\n" % (index, query, vrid))
                f_res.write("gDetect-api result:%s\n" % node.garble_res)
                f_res.write("qa_text:%s\n" % node.qa_text)
                f_res.write('\n')
            index = index + 1
        except Exception as err:
            utf8stdout(err)
            index = index + 1
            continue
    f_res.close()
    # Only send mail when garbled results were detected
    if os.path.getsize(result_file) > 0:
        report_content = mail_title
        DataFile.write_full_file(report_tmp_path, report_content)
        Mail.sendMail("立知问答结果乱码检测", report_tmp_path, mail_to, attachment=result_file)
def main():
    get_word(get_word_loc, word_file)
    index = 1
    report_content = ""
    mail_title = Template.html_h3_title("附件结果可能有乱码,请确认")
    for word in word_list:
        utf8stdout("process %d word" % index)
        try:
            # Fetch the result page and extract the QA text to check
            tmp_list = word.split()
            query = tmp_list[0]
            vrid = tmp_list[1]
            html = get_response(query)
            if not html:
                utf8stdout("source html is null. query:%s, vrid:%s" % (query, vrid))
                index = index + 1
                continue
            qa_text = get_qa_text(query, html)
            if not qa_text:
                utf8stdout("qa text is null. query:%s, vrid:%s" % (query, vrid))
                index = index + 1
                continue
            garble, res = check_garbled(query, qa_text)
            utf8stdout("query:%s, vrid:%s, is_garble:%s" % (query, vrid, garble))
            utf8stdout("gDetect-api result:%s" % res)
            utf8stdout("qa_text:%s" % qa_text)
            if garble:
                f_res.write("index:%d, query:%s, vrid:%s\n" % (index, query, vrid))
                f_res.write("gDetect-api result:%s\n" % res)
                f_res.write("qa_text:%s\n" % qa_text)
                f_res.write('\n')
            index = index + 1
        except Exception as err:
            utf8stdout(err)
            index = index + 1
            continue
    # Only send mail when garbled results were detected
    #if mail_res:
    #    utf8stdout("mail_res is not null, Send mail")
    f_res.close()
    report_content = mail_title
    DataFile.write_full_file(report_tmp_path, report_content)
    Mail.sendMail("立知问答结果乱码检测", report_tmp_path, mail_to, attachment=result_file)
def send_mail(task_id):
    try:
        report_content = ""
        url = "http://fs.sogou/lizhi_accu_compare/mission_list/" + str(task_id) + "/"
        mail_title = Template.html_h3_title("立知结果精度对比运行完毕,请对结果进行标注:")
        mail_content = Template.html_p(url)
        report_content = mail_title + mail_content
        DataFile.write_full_file(report_tmp_path, report_content)
        Mail.sendMail("立知结果精度对比运行完毕,请对结果进行标注", report_tmp_path, mail_to)
    except Exception as err:
        print("[send_mail]:%s" % err)
def getPairs(pairs, xVals):
    for pair in pairs:
        lightSpecs = [DataFile.getMassIntPairs(scanFDict[lightScanF]['dta']) for lightScanF in samePeptideClusters[pair[0]]]
        heavySpecs = [DataFile.getMassIntPairs(scanFDict[heavyScanF]['dta']) for heavyScanF in samePeptideClusters[pair[1]]]
        lightPrecMass = np.average(np.array([scanFDict[lightScanF]['precMass'] for lightScanF in samePeptideClusters[pair[0]]]))

        epSTD = (float(paramsDict['ppmstd']['value'])) * 10 ** -6 * lightPrecMass

        lightMergedSpec = SA.mergeSpectra(lightSpecs, epsilon=2*epSTD)
        heavyMergedSpec = SA.mergeSpectra(heavySpecs, epsilon=2*epSTD)

        svmClassificationData = SA.getSpectraPairInfoForSVMClassification(lightMergedSpec, heavyMergedSpec, lightPrecMass, NMod=pairConfig['NMod'], CMod=pairConfig['CMod'], epsilon=2*epSTD)
        xVals.put([svmClassificationData])
    return xVals
def parseDBScans(fDict, prog, seqMap, dbDict):
    processedInfo = {}
    for csvfile in fDict.keys():
        MASCOTData = DataFile.getScanInfo(csvfile, dbDict[prog]['fields'], delimiter=',')
        processedInfo[fDict[csvfile]] = An.preprocessDatabaseScanInfo(MASCOTData, seqMap[fDict[csvfile]], dbDict[prog]['fieldmap'])
    return processedInfo
def getSamePeptideClusters(precMassClusters, scanFDict, svmModel, svmRange, ppmSTD=5, cutOff=0):
    trueClusters = []
    for cluster in precMassClusters:
        if len(cluster) == 1:
            trueClusters += [cluster]
        else:
            # print 'testing cluster', cluster
            pairIndex = []
            xVals = []
            specs = []
            for i in range(len(cluster)):
                specs += [DataFile.getMassIntPairs(scanFDict[cluster[i]]['dta'])]

            dMatrix = np.ones((len(cluster), len(cluster))) * -2
            for i in range(len(cluster)):
                for j in range(i+1, len(cluster)):
                    epSTD = ppmSTD * 10 ** -6 * scanFDict[cluster[i]]['precMass']
                    SVMClassificationInfo = SA.getSpectraPairInfoForSVMClassification(specs[i], specs[j], scanFDict[cluster[i]]['precMass'], NMod=0, CMod=0, epsilon=2*epSTD)
                    xVals += [SVMClassificationInfo]
                    pairIndex += [(i, j)]

            xValsNorm = svmutil.normalize_instances(xVals, svmRange)
            pLabs = svmutil.svm_predict([0]*len(xValsNorm), xValsNorm, svmModel)[0]
            # print pLabs
            for i, pLab in enumerate(pLabs):
                # Scale distances by 4: totalTICRatio, 1: TotalSharedPeaksRatio
                dMatrix[pairIndex[i][0]][pairIndex[i][1]] = dMatrix[pairIndex[i][1]][pairIndex[i][0]] = xVals[i][1] if pLab == 1 else -1

            trueClusters += heirarchicalClusteringAverageLinkage([[scanF] for scanF in cluster], dMatrix, cutOff=cutOff)

    return trueClusters
def parseScans(fDict, prog, seqMap, dbDict, delimiter=',', srchID=None, seqDelimLen=2):
    processedInfo = {}
    for csvfile in fDict.keys():
        MASCOTData = DataFile.getScanInfo(csvfile, dbDict[prog]['fields'] + (['SrchID'] if srchID != None else []), delimiter=delimiter)
        processedInfo[fDict[csvfile]] = An.preprocessDatabaseScanInfo(MASCOTData, seqMap[fDict[csvfile]], dbDict[prog]['fieldmap'], srchID=srchID, seqDelimLen=seqDelimLen)
    return processedInfo
def analiseMedia(acao, nome):
    opcao = ''
    cor = ''
    dG = DataFile(nome, acao)
    sma7 = dG.getData(7)
    sma21 = dG.getData(21)
    label, sma7 = exportLastDataPlot(sma7, 'SMA')
    label, sma21 = exportLastDataPlot(sma21, 'SMA')
    if sma7[-1] < sma21[-1]:
        opcao = 'Comprar'   # buy
        cor = 'green'
    elif sma7[-1] > sma21[-1]:
        opcao = 'Vender'    # sell
        cor = 'red'
    else:
        opcao = 'Incerteza, não faça nada'  # uncertain, do nothing
        cor = 'grey'
    return opcao, cor, sma21[-1]
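# Usage sketch (hypothetical ticker and company name, not part of the original
# module): analiseMedia compares the latest 7-period and 21-period simple moving
# averages for a stock and returns a recommendation string, a plot colour and the
# latest SMA-21 value. Assumes the DataFile class and exportLastDataPlot helper
# used above are importable here.
def _example_analise_media():
    opcao, cor, ultima_sma21 = analiseMedia('PETR4.SA', 'Petrobras')
    print(opcao, cor, ultima_sma21)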
def importTAGGRAPHResults(connection, experiment_name, fraction_name, taggraph_files, max_batch_size=500000):
    try:
        experiment_id = connection.execute(select([experiment.c.id]).where(experiment.c.name == experiment_name)).fetchone()[0]
    except TypeError:
        raise ValueError("ERROR: No experiment by name %s" % experiment_name)
    try:
        fraction_id = connection.execute(select([fraction.c.id]).where(and_(fraction.c.name == fraction_name, fraction.c.experiment_id == experiment_id))).fetchone()[0]
        print "Using existing fraction %s in database for experiment" % str((fraction_id, fraction_name))
    except TypeError:
        print 'FRAC NOT FOUND, CREATING NEW FRAC', fraction_name, experiment_id
        res = connection.execute(fraction.insert().values(name=fraction_name, experiment_id=experiment_id))
        fraction_id = res.inserted_primary_key[0]
    connection.execute(fraction.update().where(fraction.c.id == fraction_id).values(taggraph_file=str(taggraph_files)))

    values = []
    for taggraph_file in taggraph_files:
        taggraph_info = DataFile.getScanInfoIterator(taggraph_file, delimiter='\t')
        for item in taggraph_info:
            values += [{
                "scan": item['ScanF'],
                "charge": item['Charge'],
                "obs_mh": item['Obs M+H'],
                "theo_mh": item['Theo M+H'],
                "ppm": item['PPM'],
                "retention_time": item['RT'],
                "alignment_score": item['Alignment Score'],
                "spectrum_score": item['Spectrum Probability Score'],
                "composite_score": item['Composite Score'],
                "context": item['Context'],
                "mod_context": item['Mod Context'],
                "mods": item['Match Modifications'],
                "mod_ranges": item['Mod Ranges'],
                "mod_ambig_edges": item['Mod Ambig Edges'],
                "proteins": item['Proteins'],
                "matching_tag_length": item['Matching Tag Length'],
                "time_taken": item['Time Taken'],
                "de_novo_peptide": item['De Novo Peptide'],
                "unmod_de_novo_peptide": item['Unmod De Novo Peptide'],
                "de_novo_score": item['De Novo Score'],
                "num_matches": item['Num Matches'],
                "fraction_id": fraction_id
            }]
            if len(values) > max_batch_size:
                res = connection.execute(result.insert(), values)
                values = []

    #fraction.results.extend([Result(scan=item['ScanF'], alignment_score=item['Alignment Score'], spectrum_score=item['Spectrum Probability Score'], composite_score=item['Composite Score'], context=item['Context'], mod_context=item['Mod Context'], mods=item['Match Modifications'], mod_ambig_edges=item['Mod Ambig Edges'], proteins=item['Proteins'], matching_tag_length=item['Matching Tag Length'], time_taken=item['Time Taken'], de_novo_peptide=item['De Novo Peptide'], unmod_de_novo_peptide=item['Unmod De Novo Peptide'], de_novo_score=item['De Novo Score'], num_matches=item['Num Matches'])])
    if len(values) > 0:
        res = connection.execute(result.insert(), values)
    return True
def parseScans(fDict, prog, seqMap, dbDict, delimiter=',', srchID=None, seqDelimLen=2):
    processedInfo = {}
    for csvfile in fDict.keys():
        cols, data = DataFile.getScanInfo(csvfile, dbDict[prog]['fields'] + (['SrchID'] if srchID != None else []), delimiter=delimiter)
        processedInfo[fDict[csvfile]] = DataFile.preprocessDatabaseScanInfo(data, seqMap[fDict[csvfile]], dbDict[prog]['fieldmap'], seqDelimLen=seqDelimLen)
    return processedInfo
def parseInitFile(init, options):
    A = setupParams()
    paramsDict = DataFile.parseParams(init)
    for param in paramsDict['Parameters'].keys():
        try:
            paramType = A[param]['attrs']['type']
            val = paramsDict['Parameters'][param]
            if paramType != 'string':
                val = getattr(__builtin__, paramType)(val)
            setattr(options, param, val)
        except KeyError:
            pass
    return paramsDict
def get_taxons_at_score_percent_cutoff(get_taxons_file, score_percent_cutoff=0.001):
    taxons = []
    all_pepts = set()
    cols, data = DataFile.getScanInfo(get_taxons_file, delimiter='\t')
    for item in data:
        all_pepts |= eval(item['Peptide Cover'])
    for item in data:
        if float(item['Score']) / len(all_pepts) >= score_percent_cutoff:
            taxons += [item['Taxon']]
    return taxons
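# Usage sketch (hypothetical report name, not part of the original module):
# keeps only taxa whose score is at least 0.1% of the total number of distinct
# peptides covered in a tab-delimited taxon report. Assumes the repo's DataFile
# module is importable here.
def _example_taxon_filter(report='example_taxon_report.tsv'):
    taxons = get_taxons_at_score_percent_cutoff(report, score_percent_cutoff=0.001)
    print taxons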
def parseInitFile(init, options):
    paramsDict = DataFile.parseParams(init)
    for param in paramsDict['LADS Parameters'].keys():
        try:
            paramType = A[param]['attrs']['type']
            val = paramsDict['LADS Parameters'][param]
            if paramType != 'string':
                val = getattr(__builtin__, paramType)(val)
            setattr(options, param, val)
        except KeyError:
            pass
    return paramsDict
def getLADSPScore(seq, dtaPath, PNet, ppm=5, ambigEdges=None, ambigAA='X', ambigPenalty=20):
    pairs = DataFile.getMassIntPairs(dtaPath)
    precMass = DataFile.getPrecMassAndCharge(dtaPath)[0]
    epsilon = ppm * precMass * 10 ** -6
    spec = PN.Spectrum(PNet, precMass, Nmod=0, Cmod=0, epsilon=epsilon, spectrum=pairs)
    spec.initializeNoiseModel()
    nodeGen = Constants.nodeInfoGen(seq, considerTerminalMods=True, ambigEdges=ambigEdges)
    pScore = 0

    node = nodeGen.next()
    pScore += spec.getNodeScore(**node)
    pScore += spec.getPriorScore(prm=0, formAA=None, lattAA=node['formAA'])
    if node['formAA'] == ambigAA:
        pScore -= ambigPenalty

    for node in nodeGen:
        pScore += spec.getNodeScore(**node)
        if node['formAA'] == ambigAA:
            pScore -= ambigPenalty

    pScore += spec.getPriorScore(prm=precMass - Constants.mods['H+'] - Constants.mods['H2O'], formAA=node['lattAA'], lattAA=None)
    if node['lattAA'] == ambigAA:
        pScore -= ambigPenalty
    return pScore
def CreateBPlusTree(filename, Maxkeys):
    '''
    Objective        : To create a B+ Tree of records.
    Input Parameters : filename -> Name of file whose records are used to create the B+ Tree.
                       Maxkeys  -> Maximum number of keys per node of the BplusTree.
    Output           : Returns the BplusTree object built from the records.
    '''
    file = open(filename, 'rb')
    b = BplusTree(Maxkeys)
    i = 0
    while True:
        try:
            record = pickle.load(file)
            key = DataFile.RecordKey(record)
            b.Insert((key, i))
            i += 1
        except EOFError:
            break
    file.close()
    return b
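# Usage sketch (hypothetical data file name, not part of the original module):
# builds a B+ tree over the pickled records in 'students.dat', inserting
# (key, record_position) pairs with at most 4 keys per node so records can later
# be located in the data file by key.
def _example_build_bplus_tree():
    return CreateBPlusTree('students.dat', Maxkeys=4)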
        compInfo[int(scan['ScanF'])] = scanInfo
    return compInfo


def scatterPlot(compInfo, axis1, axis2):
    axis1Vals = []
    axis2Vals = []
    for scanF in compInfo.keys():
        if compInfo[scanF][axis1] != 'None' and compInfo[scanF][axis2] != 'None':
            axis1Vals.extend([float(compInfo[scanF][axis1])])
            axis2Vals.extend([float(compInfo[scanF][axis2])])
    plt.scatter(axis1Vals, axis2Vals)
    plt.xlabel(axis1)
    plt.ylabel(axis2)
    plt.show()


if __name__ == '__main__':
    scansfName = 'compareSearches_MASCOT_LADSUPen10KPen15All_SEQUEST_ath001862.tdv'
    scansInfo = DataFile.getScanInfo(scansfName, delimiter='\t')
    infoMap = {'MASCOT': {'Score': 'Ion Score', 'Peptide': 'Peptide', 'Reference': 'Reference'},
               'SEQUEST': {'Score': 'XCorr', 'Peptide': 'Peptide', 'Reference': 'Reference'},
               'LADS': {'Score': 'PScore', 'Peptide': 'Peptide', 'Reference': 'Reference'}}
    compInfo = getScanComparisonInfo(scansInfo, infoMap)
    scatterPlot(compInfo, 'SEQUEST XCorr', 'LADS PScore')
def getCompStats(compSearchPath, mainProgName, progDict, infoMap, paramsDict, mainProgFields=['PScore', 'Num Ambig Edges'], getPairStats=True):
    compSearchInfo = DataFile.getScanInfo(compSearchPath, delimiter='\t')
    unpaired = {}
    other = {}
    stats = {}
    for progName, prog in progDict.items():
        if progName == mainProgName:
            continue
        unpaired[progName] = {'accuracyVec': np.array([]), 'precisionVec': np.array([]), 'numScans': 0}
        accName, precName = getAccuracyAndPrecisionNames(progName, mainProgName, compSearchInfo[0])
        stats[progName] = {}
        stats[progName]['accName'] = accName
        stats[progName]['precName'] = precName
        for progfield in mainProgFields:
            unpaired[progName][progfield] = np.array([])
        other[progName] = copy.deepcopy(unpaired[progName])

    pairsDict = {}
    if getPairStats:
        truePairs = {}
        falsePairs = {}
        compInfo = getScanComparisonInfo(compSearchInfo, infoMap, progDict, scanFields=['Score', 'Peptide', 'Obs M+H'], specificColDict={'LADS': ['Num Ambig Edges', 'Paired Spectrum', 'Pair Configuration']})
        for pairConfigName in paramsDict['Pair Configurations']:
            truePairs[pairConfigName] = {}
            falsePairs[pairConfigName] = {}
            pairsDict[pairConfigName] = {}
            for progName in progDict.keys():
                if progName == mainProgName:
                    continue
                pairsDict[pairConfigName][progName] = findPairsInSearchResults(compInfo, infoMap, progDict, paramsDict['Pair Configurations'][pairConfigName], progName=progName)
                truePairs[pairConfigName][progName] = copy.deepcopy(unpaired[progName])
                falsePairs[pairConfigName][progName] = copy.deepcopy(unpaired[progName])

    print 'Compiling stats'
    for scan in compSearchInfo:
        scanF1 = int(scan['ScanF'])
        pairType = determinePairType(pairsDict, scan, progDict, infoMap, mainProgName)
        if pairType == None:
            temp = unpaired
        elif pairType:
            temp = truePairs[scan[mainProgName + ' Pair Configuration'].lower()]
        else:
            temp = falsePairs[scan[mainProgName + ' Pair Configuration'].lower()]
        for progName in stats.keys():
            try:
                if scan[progName + ' ' + infoMap[progDict[progName]]['Score']] != 'None':
                    temp[progName]['numScans'] += 1
                    temp[progName]['accuracyVec'] = np.append(temp[progName]['accuracyVec'], float(scan[stats[progName]['accName']]))
                    temp[progName]['precisionVec'] = np.append(temp[progName]['precisionVec'], float(scan[stats[progName]['precName']]))
                    for progfield in mainProgFields:
                        temp[progName][progfield] = np.append(temp[progName][progfield], float(scan[mainProgName + ' ' + progfield]))
            except ValueError:
                other[progName]['numScans'] += 1
                for progfield in mainProgFields:
                    try:
                        other[progName][progfield] = np.append(other[progName][progfield], float(scan[mainProgName + ' ' + progfield]))
                    except ValueError:
                        print 'ERROR in getting main %s data for scan %s, peptide %s, %s %s' % (mainProgName, scan['ScanF'], scan[mainProgName + ' ' + infoMap[progDict[mainProgName]]['Peptide']], progfield, scan[mainProgName + ' ' + progfield])
                        pass

    for progName in stats.keys():
        if getPairStats:
            stats[progName]['truepairs'] = {}
            stats[progName]['falsepairs'] = {}
            stats[progName]['pairsDict'] = {}
            stats[progName]['unpaired'] = unpaired[progName]
            stats[progName]['other'] = other[progName]
            stats[progName]['composite'] = {}
            for pairConfigName in truePairs:
                stats[progName]['truepairs'][pairConfigName] = truePairs[pairConfigName][progName]
                stats[progName]['falsepairs'][pairConfigName] = falsePairs[pairConfigName][progName]
                stats[progName]['pairsDict'][pairConfigName] = pairsDict[pairConfigName][progName]
            for field in stats[progName]['unpaired']:
                try:
                    truePairsComp = np.concatenate([stats[progName]['truepairs'][pairConfigName][field] for pairConfigName in stats[progName]['truepairs']])
                    falsePairsComp = np.concatenate([stats[progName]['falsepairs'][pairConfigName][field] for pairConfigName in stats[progName]['falsepairs']])
                    stats[progName]['composite'][field] = np.concatenate((truePairsComp, falsePairsComp, stats[progName]['unpaired'][field]))
                except ValueError:
                    pass
            numTruePairs = np.sum([stats[progName]['truepairs'][pairConfigName]['numScans'] for pairConfigName in stats[progName]['truepairs']])
            numFalsePairs = np.sum([stats[progName]['falsepairs'][pairConfigName]['numScans'] for pairConfigName in stats[progName]['falsepairs']])
            stats[progName]['composite']['numScans'] = numTruePairs + numFalsePairs + stats[progName]['unpaired']['numScans']
        else:
            stats[progName]['other'] = other[progName]
            stats[progName]['composite'] = unpaired[progName]

    return stats
def getSequencing(pair, sharedPeaks, paramsDict, outFile, res):
    global print_lock, spectrum_lock
    result = []
    scanData = {}
    lightSpecs = [DataFile.getMassIntPairs(scanFDict[lightScanF]['dta']) for lightScanF in samePeptideClusters[pair[0]]]
    heavySpecs = [DataFile.getMassIntPairs(scanFDict[heavyScanF]['dta']) for heavyScanF in samePeptideClusters[pair[1]]]
    precMass = np.average(np.array([scanFDict[lightScanF]['precMass'] for lightScanF in samePeptideClusters[pair[0]]]))

    epMean = options.ppmsyserror * precMass * 10**-6
    epSTD = options.ppmstd * precMass * 10**-6

    scanData['shared peaks ratio'] = sharedPeaks

    s1 = time.time()
    sharedInfo, starts, ends, deltas, G = DNS.prepPairedSpectrumGraph(lightSpecs, heavySpecs, precMass, addEnds, ppmSTD=options.ppmstd, Nmod=pairConfig['NMod'], Cmod=pairConfig['CMod'], verbose=options.verbose)
    scanData['M+H'] = precMass

    specs = []
    for massIntPairs in lightSpecs:
        specs += [PN.Spectrum(PNet, precMass, Nmod=0.0, Cmod=0.0, epsilon=2*epSTD, spectrum=massIntPairs)]
    for massIntPairs in heavySpecs:
        specs += [PN.Spectrum(PNet, precMass + pairConfig['NMod'] + pairConfig['CMod'], Nmod=pairConfig['NMod'], Cmod=pairConfig['CMod'], epsilon=2*epSTD, spectrum=massIntPairs)]
    for spec in specs:
        spec.initializeNoiseModel()

    # with spectrum_lock:
    temp = DNS.getSpectrumGraphDataThread(G, deltas, specs, starts, ends, precMass - Constants.mods['H+'] - Constants.mods['H2O'], ambigPenaltyFun, ppmPenaltyFun, hashedAAs, termModHash=termModHash, maxEdge=options.maxedge, minEdge=options.minedge, subGraphCut=options.subgraphcut, subAlpha=0.3, alpha=options.alpha, epMean=epMean, epSTD=epSTD, epStep=epStep, verbose=options.verbose)
    temp_scan = temp[0]
    peps = temp[1]
    scanData.update(temp_scan)

    scanData['pair configuration'] = pairConfigName
    with print_lock:
        print 'Now sequencing light scan(s) %s, heavy scan(s) %s with shared peaks ratio %f \n' % (str(samePeptideClusters[pair[0]]), str(samePeptideClusters[pair[1]]), scanData['shared peaks ratio'])
        # out.append('Now sequencing light scan(s) ' + str(samePeptideClusters[pair[0]]) + ', heavy scan(s) ' + str(samePeptideClusters[pair[1]]) + ' with shared peaks ratio ' + str(scanData['shared peaks ratio']) + ' \n' )
        Ord = np.argsort(-1 * np.array(scanData['over_scores']))
        if scanData['blind'] == 0:
            for i in range(min(Ord.size, 10)):
                try:
                    print 'Score: ', peps[0][Ord[i]], 'Seq: ', ''.join(peps[1][Ord[i]])
                    # out.append('Score: ' + str(peps[0][Ord[i]]) + ' Seq: ' + ''.join(peps[1][Ord[i]]))
                except TypeError:
                    print 'Score: ', peps[0][Ord[i]], 'Seq: ', peps[1][Ord[i]]
                    # out.append('Score: ' + str(peps[0][Ord[i]]) + ' Seq: ' + str(peps[1][Ord[i]]))
        elif scanData['blind'] == 1:
            for i in range(min(Ord.size, maxNum)):
                try:
                    print 'Score: ', peps[0][Ord[i]], 'Seq: ', ''.join(peps[1][Ord[i]][0]), 'Mod Names: ', peps[2][Ord[i]][1]
                    # out.append('Score: ' + str(peps[0][Ord[i]]) + ' Seq: ' + ''.join(peps[1][Ord[i]][0]) + ' Mod Names: ' + peps[2][Ord[i]][1])
                except TypeError:
                    print 'Score: ', peps[0][Ord[i]], 'Seq: ', peps[1][Ord[i]][0], 'Mod Names: ', peps[2][1]
                    # out.append('Score: ' + str(peps[0][Ord[i]]) + ' Seq: ' + peps[1][Ord[i]][0] + ' Mod Names: ' + peps[2][1])

        scanData['sequencing time'] = time.time() - s1
        print '\nTime Taken:', time.time() - s1, '\n'
        # out.append('\nTime Taken: ' + str(time.time() - s1) + '\n')

    if validateHeavySequence(scanData['seq'], heavySeqMap, scanData['ambiguous edges']):
        for scanF in samePeptideClusters[pair[0]] + samePeptideClusters[pair[1]]:
            scanFDict[scanF]['sequenced'] = True
        if options.output:
            for pair in [(lightScanF, heavyScanF) for lightScanF in samePeptideClusters[pair[0]] for heavyScanF in samePeptideClusters[pair[1]]]:
                scanData['light scan'] = int(pair[0])
                scanData['heavy scan'] = int(pair[1])
                # outFile.write('\t'.join([str(scanData[col]) for col in cols]) + '\n')
                # print str(scanData[col])
                res.append([str(scanData[col]) for col in cols])
    else:
        print 'WARNING: Invalid sequence! Unsuccessful sequencing of %s and %s with pair configuration %s' % (str(samePeptideClusters[pair[0]]), str(samePeptideClusters[pair[1]]), pairConfigName)
    exit(0)
def addPlausibleCandidatesFromModList(connection, fracs, expand_mods, data_dir, hashed_single_mods, hashed_mod_combos, prob_network, ep_step=0.01, mod_tolerance=0.1, ppmSTD=10, isobaric_mod_penalty=-0.5, def_mod_penalty=-1, indel_penalty=-3, undef_mod_penalty=-3, spectrum_score_cutoff=0, max_per_scan=10): for fraction_id, fraction_name in fracs: # Load in dta info frac_num = int(fraction_name[1:]) ''' Replace os.path.sep with '/' to fix Windows backslash issues. --smp dta_dir = glob.glob(data_dir + os.path.sep + '*f%02d'%frac_num)[0] + os.path.sep ''' dta_dir = glob.glob(data_dir + '/' + '*f%02d' % frac_num)[0] + '/' dtaList = glob.glob(dta_dir + '*.dta') scanFDict = DataFile.getScanFDict(dtaList) # Get TAG-GRAPH results stmt = select([ results.c.scan, results.c.alignment_score, results.c.context, results.c.mod_context, results.c.mods, results.c.mod_ranges, results.c.mod_ambig_edges, results.c.proteins, results.c.matching_tag_length, results.c.de_novo_peptide, results.c.unmod_de_novo_peptide, results.c.de_novo_score, results.c.num_matches, results.c.obs_mh, results.c.retention_time ]).where(results.c.fraction_id == fraction_id).where( results.c.spectrum_score > spectrum_score_cutoff).order_by( results.c.scan).order_by(desc(results.c.composite_score)) response = connection.execution_options( stream_results=True).execute(stmt) indexed_results = defaultdict(list) for row in SFW.string_folding_wrapper(response): indexed_results[row[0]] += [row[1:]] new_scan_items = {} for scanF in indexed_results: # TODO: Don't add mod candidates for crappy results to save time (use spectrum prob score for this or protease specificity of localization?) # Can also do initial rounds of EM and use a probability cutoff (see above idea for gating mod candidates) # Eval mod lists and mod ranges once (will be using them over and over) mod_lists = [] mod_ranges_list = [] mod_tuples_list = [] enumerated_mods = defaultdict(set) exact_match = False for item in indexed_results[scanF]: mods = eval(item[3]) mod_lists += [mods] mod_ranges = tuple(eval(item[4])) mod_ranges_list += [mod_ranges] if len(mods) == 0 or all( [mod[0][0] == 'Isobaric Substitution' for mod in mods]): exact_match = True break mod_tuples = [] for j, mod in enumerate(mods): mod_tuple = Validator.getUniqueModTuple( [mod], undef_mod_round_precision=2)[0] enumerated_mods[(item[1], mod_ranges[j])].add(mod_tuple) mod_tuples += [mod_tuple] mod_tuples_list += [mod_tuples] #print fraction_name, scanF, exact_match # Don't add mod candidates if exact match is found if exact_match: continue #if scanF != 5841 or fraction_name != 'F12': # continue #print '-----------------------' #print indexed_results[scanF] # Add mod candidates which can plausibly be the sum of two separate mods new_combo_mods = addComboModCandidates( scanF, indexed_results[scanF], mod_lists, mod_ranges_list, mod_tuples_list, enumerated_mods, scanFDict, expand_mods, hashed_mod_combos, prob_network, ep_step, mod_tolerance, ppmSTD) #print 'Num Combo Mods after getUniqueCandidates', sum([len(val[1]) for val in new_combo_mods]) #print '---Combo Mods---' #print new_combo_mods #print enumerated_mods new_single_mods = addSingleModCandidates( scanF, indexed_results[scanF], mod_lists, mod_ranges_list, mod_tuples_list, enumerated_mods, scanFDict, expand_mods, hashed_single_mods, prob_network, ep_step, mod_tolerance, ppmSTD) #print 'Num Single Mods after getUniqueCandidates', sum([len(val[1]) for val in new_single_mods]) #print '---Single Mods---' #print new_single_mods new_scan_items[scanF] = new_single_mods 
+ new_combo_mods #print scanF, new_scan_items[scanF] #print 'Indexed results scans', sorted(indexed_results.keys()) #print 'new_scan_items scans', sorted(new_scan_items.keys()) # Import new candidates into DB values = [] for scanF in indexed_results: #print scanF, scanF in new_scan_items, int(scanF) in new_scan_items, str(scanF) in new_scan_items if scanF in new_scan_items: #print 'NUM', len(new_scan_items[scanF]) indexed_item = indexed_results[scanF][0] i = 0 # Sort candidates by sum of prevalences of mods # No need to sort by composite_score, only top scoring mods for each (context, mod_tuple) pair are included in the new_scan_items (filtered using getUniqueCandidates) for item in sorted(new_scan_items[scanF], key=lambda k: -sum( [expand_mods[mod] for mod in k[0][1]]) / len(k[0][1])): for candidate in item[1]: candidate.update({ "scan": scanF, "charge": scanFDict[scanF]['charge'], "matching_tag_length": indexed_item[7], "time_taken": 0, "de_novo_peptide": indexed_item[8], "unmod_de_novo_peptide": indexed_item[9], "de_novo_score": indexed_item[10], "num_matches": indexed_item[11], "obs_mh": indexed_item[12], "retention_time": indexed_item[13], "ppm": (candidate["theo_mh"] - indexed_item[12]) / candidate["theo_mh"] * 1000000, "fraction_id": fraction_id }) values += [candidate] i += 1 if i > max_per_scan: break print 'Adding %i candidates for fraction %s' % (len(values), fraction_name) res = connection.execute(results.insert(), values) return new_scan_items
def addComboModCandidates(scanF, scan_items, mod_lists, mod_ranges_list, mod_tuples_list, enumerated_mods, scanFDict, expand_mods, hashed_mod_combos, prob_network, ep_step=0.01, mod_tolerance=0.1, ppmSTD=10): add_mods_map = defaultdict(set) # Go through entire candidate list, identify alternate combo mod interpretations for given mod ranges for i, item in enumerate(scan_items): mod_ranges = mod_ranges_list[i] for j, mod in enumerate(mod_lists[i]): # Continue if mod has already been expanded # Format of key in add_mods_map is (context, mod_range) if mod[0][0] == 'Insertion' or mod[0][0] == 'Deletion' or mod[0][ 0] == 'Isobaric Substitution': continue # print j, mod, mod_ranges, mod_ranges[j], item[1] # Initialize set so that this can be skipped if it comes up in the future (and no candidates are found) add_mods_map[(item[1], mod_ranges[j], mod_tuples_list[i][j])] = set() # now hash mass of mods in peptides to see if alternate combo candidates can be found for mod_combo_candidate in hashed_mod_combos[mod_tuples_list[i] [j]]: # ModError is defined as mass of de_novo_seq - mass of modified reference_seq mod_error = (0 if not mod[0][2] else mod[0][2]) - mod_combo_candidate[-1] # make sure that mod_error is within tolerance and no expanded mod for given mod interval has greater prevalence than either mod in mod combo if not (abs(mod_error) > mod_tolerance or any([ expand_mods[enum_mod] > expand_mods[mod_combo_candidate[0]] or expand_mods[enum_mod] > expand_mods[mod_combo_candidate[1]] for enum_mod in enumerated_mods[(item[1], mod_ranges[j])] ])): add_mods_map[(item[1], mod_ranges[j], mod_tuples_list[i][j])].add( (mod_combo_candidate, mod_error)) #print 'Add mods', add_mods_map # Get Sequence candidates for mod ranges which have valid combo mods candidates_map = {} # print add_mods_map.keys() for context, mod_range, mod_tuple in add_mods_map: candidates = [] term = getTerminus(mod_range[0], mod_range[1], context) for mod_combo in add_mods_map[(context, mod_range, mod_tuple)]: mod_1, mod_2 = mod_combo[0][:2] subseq = context[2:-2][mod_range[0]:mod_range[1]] locs_1 = getModLocs(subseq, term, mod_1) locs_2 = getModLocs(subseq, term, mod_2) # TODO: Only add mod combo with most prevalent single mods for a given (context, mod_range) combination (as opposed to all valid mod combos as is done now)? 
for loc_1 in locs_1[1]: for loc_2 in locs_2[1]: if loc_1 < loc_2: candidates += getComboModSeq(subseq, mod_1, loc_1, mod_2, loc_2, mod_combo[1]) elif loc_1 > loc_2: candidates += getComboModSeq(subseq, mod_2, loc_2, mod_1, loc_1, mod_combo[1]) elif locs_1[0] != locs_2[0] and ( mod_1[0] == 'undefined mass shift' or mod_2[0] == 'undefined mass shift' or mod_1[2][1] != mod_2[2][1]): # Second part of if statement guards against things like putting a trimethyl (A, N-term) with a Carbamyl (N-term, N-term) on the same residue candidates += getComboModSeq(subseq, mod_1, loc_1, mod_2, loc_2, mod_combo[1]) if len(candidates) > 0: candidates_map[(context, mod_range, mod_tuple)] = candidates #print candidates_map # Note: the way this code is written now, this method might produce LOTS of duplicate entries, particularly if the peptide in question is multiply modified # This is because the mod_range which has a valid combo mod is expanded for each scan item (in each scan item, the position of the mod within the mod_range may be different, but it expands out to the same set of new candidates) # In the future (if this way is too slow), we can minimize this time by caching the position, mod for each enumerated candidate, and only expand if the set of all mods (not including the mod_range to expand) is unique # For now, redundant candidates are filtered out at return step new_scan_items = [] if len(candidates_map) > 0 and scanF in scanFDict: precMass = scanFDict[scanF]['precMass'] epSTD = ppmSTD * precMass * 10**-6 spec = PN.Spectrum(prob_network, precMass, Nmod=0.0, Cmod=0.0, epsilon=2 * epSTD, spectrum=DataFile.getMassIntPairs( scanFDict[scanF]['dta']), useMemo=True) spec.initializeNoiseModel() # Generate new entries for scan from peptides for i, item in enumerate(scan_items): for j, mod_range in enumerate(mod_ranges_list[i]): if (item[1], mod_range, mod_tuples_list[i][j]) in candidates_map: new_scan_items += [ getComboModCandidate(spec, scanFDict[scanF]['charge'], item, mod_lists[i], mod_ranges_list[i], j, candidate[0], candidate[1], candidate[2], candidate[3], candidate[4]) for candidate in candidates_map[( item[1], mod_range, mod_tuples_list[i][j])] ] #print 'Num Combo Mods before getUniqueCandidates', len(new_scan_items) return getUniqueCandidates(new_scan_items)
import sys
import DataFile
from urllib.parse import quote
from io import BytesIO
from PIL import Image
import base64
import subprocess
import demjson
import time
import Mail
import Template

url_prefix = "http://wap.sogou.com.inner/web/searchList.jsp?keyword="
get_word_loc = "http://10.143.54.80:81/vr_query_period/vr_query_garbled.txt"
word_file = "./word_top"
word_list = DataFile.read_file_into_list("./word_top")
pic_dir_prefix = "/search/odin/nginx/html/wap/tupu_garbled_pic/pic"
report_tmp_path = "mail_detail.html"
mail_to = "*****@*****.**"


def log_info(str):
    time_str = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    sys.stdout.write('[%s] [info] %s\n' % (time_str, str))
    sys.stdout.flush()


def utf8stdout(in_str):
    utf8stdout = open(1, 'w', encoding='utf-8',
if __name__ == '__main__':
    print 'This program generates a results file containing Raw lads output postscored with the algorithm of choice. The discmodel is a supplied model, if necessary for the postscoring algorithm'
    options = ArgLib.parse(['init', 'ppmstd', 'dtadir', 'lads', 'sequest', 'config', 'model', 'output', 'symbolmap'],
                           optArgs=[{'opts': ('-D', '--discmodel'), 'attrs': {'type': 'string', 'dest': 'discmodel', 'help': 'Model used to calculate discriminant score'}},
                                    {'opts': ('-P', '--pairconfig'), 'attrs': {'type': 'string', 'dest': 'pairconfig', 'help': 'Name of LADS Pair Configuration'}},
                                    {'opts': ('-F', '--featurelist'), 'attrs': {'type': 'string', 'dest': 'featurelist', 'help': 'File containing pickled list of desired features (optional)'}}])

    parent = os.path.abspath(os.pardir)
    PNet = PN.ProbNetwork(options.config, options.model)
    paramsDict = ArgLib.parseInitFile(options.init, options)
    pairConfigurations = paramsDict['Pair Configurations']

    LADSSeqInfo = GLFD.parseSequenceDTAsLogfile(options.lads)

    with open(options.symbolmap, 'r') as fin:
        symbolMap = pickle.load(fin)
    seqMap = DataFile.generateSeqMap({'LADS Unit Test': 'LADS'}, symbolMap, paramsDict)
    seqMap = seqMap['LADS Unit Test']

    if options.featurelist:
        with open(options.featurelist) as fin:
            desired_feats = pickle.load(fin)
    else:
        desired_feats = None

    heavySeqMaps = {}
    for confName in pairConfigurations:
        heavySeqMaps[confName] = copy.deepcopy(seqMap)
        heavySeqMaps[confName]['Mods']['N-Term'] = pairConfigurations[confName]['NModSymbol']
        heavySeqMaps[confName]['Mods']['C-Term'] = pairConfigurations[confName]['CModSymbol']

    if options.pairconfig:
        binsDict[bin][2] = binsDict[bin][0] - binsDict[bin][1]

    outFile.write('\n%s Scan Number Difference Distribution. Max Diff: %i' % (name, maxDiff) + '\n')
    outFile.write('\t'.join(['Diff Bin', 'Test Pairs', 'True Pairs', 'False Pairs']) + '\n')
    for i in range(numBins):
        outFile.write('\t'.join([str(elem) for elem in [bins[i], binsDict[i][0], binsDict[i][1], binsDict[i][2]]]) + '\n')


if __name__ == '__main__':
    print 'Model refers to svmmodel used'
    options = ArgLib.parse(['dtadir', 'combined', 'sequest', 'mascot', 'database', 'output', 'ppmstd', 'init', 'symbolmap'])
    paramsDict = ArgLib.parseInitFile(options.init, options)
    progDict = ArgLib.getProgDict(An.searchprogs, options)
    dbDict = DataFile.getDBInfo(options.database)
    infoMap = dbDict['infoMap']

    with open(options.symbolmap, 'r') as fin:
        symbolMap = pickle.load(fin)
    seqMap = DataFile.generateSeqMap(progDict, symbolMap, paramsDict)

    processedInfo = {}

    if options.mascot:
        MASCOTdict = eval(options.mascot)
        processedInfo.update(CS.parseScans(MASCOTdict, 'MASCOT', seqMap, dbDict))
    if options.sequest:
        SEQUESTdict = eval(options.sequest)
        processedInfo.update(CS.parseScans(SEQUESTdict, 'SEQUEST', seqMap, dbDict))
    combMasses = []
    for cluster in clusters:
        if len(cluster) > 1:
            combMasses += [sum(cluster) / len(cluster)]
        else:
            combMasses += cluster
    return np.sort(np.array(combMasses))


if __name__ == '__main__':
    dirPath = 'C:\\Users\\Arun\\Pythonprojects\\DeNovoSequencing\\LF2_short_HCD+CID_ath001862_244\\'
    ppm = 5
    heavyPath = dirPath + '244.3611.3611.1.dta'
    lightPath = dirPath + '244.3619.3619.1.dta'
    heavyPairs = DataFile.getMassIntPairs(heavyPath)
    lightPairs = DataFile.getMassIntPairs(lightPath)
    heavyPrecMass, heavyCharge = DataFile.getPrecMassAndCharge(heavyPath)
    lightPrecMass, lightCharge = DataFile.getPrecMassAndCharge(lightPath)
    print ppm * 10 ** -6 * heavyPrecMass
    print getSharedPeaksRatio(lightPairs, heavyPairs, Nmod=0, Cmod=Constants.mods['*'], epsilon=ppm * heavyPrecMass * 10 ** -6)
    """
    tPath = dirPath + '244.0855.0855.1.dta'
    tMass = DataFile.getPrecMassAndCharge(tPath)[0]
    tPairs = DataFile.getMassIntPairs(tPath)
    tIons = tPairs[:,0]
    tIons = np.insert(tIons, 0, 0)
    tIons = np.append(tIons, tMass)
"ppmpenalty", "ambigpenalty", "minedge", "maxedge", "alpha", "subgraphcut", "symbolmap", ] ) epStep = 0.00025 maxEp = 0.1 paramsDict = ArgLib.parseInitFile(options.init, options) with open(options.symbolmap, "r") as fin: symbolMap = pickle.load(fin) seqMap = DataFile.generateSeqMap({"LADS Unit Test": "LADS"}, symbolMap, paramsDict) if options.columns: with open(options.columns) as fin: cols = pickle.load(fin) else: print "Using default cols" cols = [ "light scan", "heavy scan", "pair configuration", "M+H", "score", "seq", "epsilon", "ambiguous edges",
outFile.write("\t".join(["Diff Bin", "Test Pairs", "True Pairs", "False Pairs"]) + "\n") for i in range(numBins): outFile.write( "\t".join([str(elem) for elem in [bins[i], binsDict[i][0], binsDict[i][1], binsDict[i][2]]]) + "\n" ) if __name__ == "__main__": print "Model refers to svmmodel used" options = ArgLib.parse( ["dtadir", "combined", "sequest", "mascot", "database", "output", "ppmstd", "init", "symbolmap"] ) paramsDict = ArgLib.parseInitFile(options.init, options) progDict = ArgLib.getProgDict(An.searchprogs, options) dbDict = DataFile.getDBInfo(options.database) with open(options.symbolmap, "r") as fin: symbolMap = pickle.load(fin) seqMap = DataFile.generateSeqMap(progDict, symbolMap, paramsDict) outFile = open(options.output, "w") print options.dtadir dtaList = glob.glob(options.dtadir + "/*.dta") scanFDict = getScanFDict(dtaList) processedInfo = {} if options.mascot: MASCOTdict = eval(options.mascot) processedInfo.update(CS.parseScans(MASCOTdict, "MASCOT", seqMap, dbDict))
def loadInit(self):
    self._paramsDict = DataFile.parseParams(self._selectedInitFile.get())
    with open('../Misc/symbolmap.txt', 'r') as fin:
        symbolMap = pickle.load(fin)
    self._seqMap = DataFile.generateSeqMap({'LADS Unit Test': 'LADS'}, symbolMap, self._paramsDict)['LADS Unit Test']
    self._aas = Constants.addPepsToAADict(self._minedge)
import requests
import DataFile
import Template
import Mail
import random

top_word_loc = "http://$ip/vr_query_period/vr_query_pv.txt"
random_word_loc = "http://$ip/vr_query_period/vr_query_random.txt"
top_word_file = "./word_top"
random_word_file = "./word_random"
url_file = "./url_lizhi"
url_prefix = "https://wap.sogou.com/web/searchList.jsp?keyword="
#mail_lst = ['*****@*****.**']
mail_lst = DataFile.read_file_into_list("./mail_list")
report_tmp_path = "mail_detail.html"


def get_word(url, word_file):
    try:
        res = requests.get(url)
        res.encoding = "utf-8"
        with open(word_file, 'w', encoding='utf8') as f:
            f.write(res.text)
    except Exception as err:
        print('[get_word]: %s' % err)


def gen_url(f_in, f_out):
'''
mzMLFiles = glob.glob(dataDir + '/' + '*f%02d.mzML' % options.fraction)
mzXMLFiles = glob.glob(dataDir + '/' + '*f%02d.mzXML' % options.fraction)
if len(mzMLFiles) > 0:
    ## Found mzML file for this fraction
    fileFound = True
    mzMLFile = os.path.abspath(mzMLFiles[0])
    ## The directory for DTA files will be the same as the mzML file without the .mzML extension
    mzml_name_base = mzMLFile[:-5]
    print 'mzMLFile: "%s"' % (mzMLFile)
    print 'mzml_name_base: "%s"' % (mzml_name_base)
    # Unpack DTAs
    DataFile.executeProcess(SCRIPTS_DIR, 'mzml2dta.py', ['-o', mzml_name_base, mzMLFile])
    '''
    Replace os.path.sep with '/' to fix Windows backslash issues. --smp
    dtaDir = glob.glob(dataDir + os.path.sep + '*f%02d'%options.fraction)[0] + os.path.sep
    '''
    localDtaDir = glob.glob(dataDir + '/' + '*f%02d' % options.fraction)[0] + '/'
    print 'Found mzML, setting dtaDir to %s' % (localDtaDir)
elif len(mzXMLFiles) > 0:
    ## Found mzXML file for this fraction
    fileFound = True
    mzXMLFile = os.path.abspath(mzXMLFiles[0])
    '''
    ## The directory for DTA files will be the same as the mzML file without the .mzML extension
    mzml_name_base = mzMLFile[:-5]
    ## print 'mzml_name_base: "%s"' % (mzml_name_base)
    '''
import time
import os, sys
from bs4 import BeautifulSoup
import DataFile
from urllib.parse import quote
import Mail
import Template
from ast import literal_eval
from itertools import chain

MIN_TEXT_LENGTH = 5
url_prefix = "http://wap.sogou.com.inner/web/searchList.jsp?keyword="
get_word_loc = "http://10.143.54.80:81/vr_query_period/vr_query_garbled_lizhi.txt"
word_file = "./word_lizhiqa"
word_list = DataFile.read_file_into_list("./word_lizhiqa")
report_tmp_path = "mail_detail.html"
mail_to = "*****@*****.**"
result_file = 'lizhiqa_garbled_result'
f_res = open(result_file, 'w', encoding='utf8')


class Node(object):
    def __init__(self, query, vrid):
        self.query = query
        self.vrid = vrid
        self.url = ""
        self.html = ""
        self.qa_text = ""
        self.garble = False
        self.garble_res = ""
#!/usr/bin/python3
# -*- coding: utf-8 -*-
from pyppeteer import launch
import asyncio
from urllib.parse import quote
import DataFile
import datetime
import os
import time
import random

wordlist = DataFile.read_file_into_list("./vr_1")


async def action_get_page_content(page):
    content = await page.evaluate('document.documentElement.outerHTML', force_expr=True)
    return content


async def action_is_element_exist(page, selector):
    # Return the first Element in the document that matches the given selector
    # (or selector group); returns null if there is no match.
    el = await page.querySelector(selector)
    return el


async def action_get_result_loc(page):
    result_loc_list = await page.evaluate('''() => {
        //classid blacklist: 50000000 placeholder, 50023801/50023901/50024901 related recommendations,
        //30010081 mid-page hint, 21312001 relation graph, 11005401 Sogou Wenwen question
        classidBlackArr = ['50000000','50023801','50023901','30010081','21312001','11005401', '50024901'];
        if LCs[i] < cutOff:
            procSeq[i] = ambigAA
            ambig_edges += [(0, Constants.aminoacids[aa][2])]

    return ''.join(procSeq), ambig_edges


if __name__ == '__main__':
    print 'In this program, the PEAKS argument is just the location of the PEAKS output to parse. Number argument indicates ALC cutoff to form ambig edges (set to 0 to not form any ambiguous edges)'
    options = ArgLib.parse(['init', 'output', 'symbolmap', 'peaks', 'cutoff'])

    AMBIG_AA = '@'

    paramsDict = ArgLib.parseInitFile(options.init, options)
    with open(options.symbolmap, 'r') as fin:
        symbolMap = pickle.load(fin)
    seqMap = DataFile.generateSeqMap({'PEAKS': 'PEAKS'}, symbolMap, paramsDict)
    #print seqMap

    scanInfo = DataFile.getScanInfo(options.peaks, delimiter=',')[1]

    if 'Peptide' in scanInfo[0]:
        seq_col = 'Peptide'
    else:
        seq_col = 'Sequence'

    outFile = open(options.output, 'w')
    #print 'cutoff_arge', options.cutoff
    cols = ['ScanF', 'Charge', 'RT', 'Obs M+H', 'Peptide', 'ALC (%)', 'LC']

    alc_cutoff = options.cutoff if options.cutoff else 0
    if alc_cutoff > 0:
        cols.insert(-2, 'Ambig Edges')
    return False


def parseDBScans(fDict, prog, seqMap, dbDict):
    processedInfo = {}
    for csvfile in fDict.keys():
        MASCOTData = DataFile.getScanInfo(csvfile, dbDict[prog]['fields'], delimiter=',')
        processedInfo[fDict[csvfile]] = An.preprocessDatabaseScanInfo(MASCOTData, seqMap[fDict[csvfile]], dbDict[prog]['fieldmap'])
    return processedInfo


#Number argument refers to minimum number of search prog results which have the same peptide for it to be included in the final output
if __name__ == '__main__':
    options = ArgLib.parse(['init', 'sequest', 'lads', 'mascot', 'output', 'database', 'symbolmap', 'number'])
    paramsDict = ArgLib.parseInitFile(options.init, options)
    dbDict = DataFile.getDBInfo(options.database)
    progDict = ArgLib.getProgDict(An.searchprogs, options)

    with open(options.symbolmap, 'r') as fin:
        symbolMap = pickle.load(fin)
    seqMap = DataFile.generateSeqMap(progDict, symbolMap, paramsDict)

    if hasattr(options, 'number'):
        minNumScans = int(options.number)
    else:
        minNumScans = 1

    processedInfo = {}

    if options.lads:
        LADSdict = eval(options.lads)
        for tdvfile in LADSdict.keys():
def main(argv): try: opts, args = getopt.getopt(argv, "h", ["help"]) except getopt.GetoptError: sys.exit(2) for opt, arg in opts: print 'opt: %s' % (opt) print 'arg: %s' % (arg) if opt in ("-h", "--help"): printUsage() sys.exit(1) ''' Now process the arguments (INI file path)''' if len(args) != 1: printUsage() sys.exit(1) configFileName = args[0] ### create a output file/handle: tmpFolder = tempfile.gettempdir() (tm_year, tm_mon, tm_mday, tm_hour, tm_min, tm_sec, tm_wday, tm_yday, tm_isdst) = time.localtime(time.time()) runCapture = tmpFolder + '/RunTG' + str(tm_mon) + str(tm_mday) + str( tm_hour) + str(tm_min) + str(tm_sec) + '.txt' fh = open(runCapture, 'w') write2FileStdout( fh, '**** start TagGraph process: %s' % (datetime.datetime.now())) write2FileStdout(fh, TAGGRAPH_CONFIG_HEADER) write2FileStdout(fh, configFileName + "\n") if os.path.isfile(configFileName) and os.access(configFileName, os.R_OK): write2FileStdout(fh, MSGBORDER) write2FileStdout(fh, "Using Configuration File: %s" % configFileName) write2FileStdout(fh, MSGBORDER) else: #print ' ** FAILURE ** Could not read configuration file: \'%s\'' % (configFileName) write2FileStdout( fh, ' ** FAILURE ** Could not read configuration file: \'%s\'' % (configFileName)) sys.exit(1) theConfig = ConfigParser.ConfigParser() theConfig.optionxform = str theConfig.read(configFileName) #sectionNames = theConfig.sections() generalSectionMap = ConfigSectionMap(theConfig, "General") tagGraphSectionMap = ConfigSectionMap(theConfig, "TagGraph") ###### init', 'dtadir', 'peaks', 'output', 'ppmstd', 'modtolerance', 'unimoddict', 'maxcounts', 'modmaxcounts', 'fmindex', 'model', 'config' ## Define our Required Arguments ## fatalError = False ## Arguments that must exist, and be numbers ## requiredTagGraphNumericArgs = [ 'ppmstd', 'modtolerance', 'maxcounts', 'modmaxcounts' ] ## Arguments that must exist, and be paths that point to files that exist and are Readable ## requiredTagGraphExistingFiles = [ 'unimoddict', 'model', 'config', 'init', 'de_novo' ] ## Arguments that must exist, and be directories that can be created on the filesystem ## requiredTagGraphToCreate = ['output'] ## Special Arguments: # ExperimentName must be a string # d must be a directory, with mzML/mzXML files in it that start with ExperimentName # f must be an fmindex name of the form <basepath>.fm, where <basepath> is the basename and the following files should exist: <basename>.fm.1, <basename>.seqnames.1, <basename>.offsets ## Arguments that must exist, and be numbers ## for currArg in requiredTagGraphNumericArgs: if currArg in tagGraphSectionMap: if isNumeric(tagGraphSectionMap[currArg]): write2FileStdout( fh, '* Found Required Numeric TagGraph Parameter \'%s\' : \'%s\'' % (currArg, tagGraphSectionMap[currArg])) else: fatalError = True write2FileStdout( fh, '** FAILURE ** Required TagGraph Parameter \'%s\' must be a numeric value, found value \'%s\'' % (currArg, tagGraphSectionMap[currArg])) else: fatalError = True write2FileStdout( fh, '** FAILURE ** Required TagGraph Parameter \'%s\' not found in config file' % (currArg)) ## Arguments that must exist, and be paths that point to files that exist and are Readable ## for currArg in requiredTagGraphExistingFiles: if currArg in tagGraphSectionMap: if os.path.isfile(tagGraphSectionMap[currArg]) and os.access( tagGraphSectionMap[currArg], os.R_OK): write2FileStdout( fh, '* Found Required Readable File for TagGraph Parameter \'%s\' : \'%s\'' % (currArg, tagGraphSectionMap[currArg])) else: if not 
os.path.isfile(tagGraphSectionMap[currArg]): fatalError = True write2FileStdout( fh, '** FAILURE ** Could not find file for Required Parameter \'%s\' at \'%s\'' % (currArg, tagGraphSectionMap[currArg])) elif not os.access(tagGraphSectionMap[currArg], os.R_OK): fatalError = True write2FileStdout( fh, '** FAILURE ** Could not Read file for Required Parameter \'%s\' at \'%s\' (check permissions)' % (currArg, tagGraphSectionMap[currArg])) else: fatalError = True write2FileStdout( fh, '** FAILURE ** Required TagGraph Parameter \'%s\' not found in config file' % (currArg)) ## Arguments that must exist, and be directories that should not already exist but can be created on the filesystem ## for currArg in requiredTagGraphToCreate: if currArg in tagGraphSectionMap: dirToCreate = tagGraphSectionMap[currArg] if not os.path.exists(dirToCreate): try: ## Should be able to make the directory, and then remove it ## os.makedirs(dirToCreate) os.rmdir(dirToCreate) write2FileStdout( fh, '* Found Required Createable Directory for TagGraph Parameter \'%s\' : \'%s\'' % (currArg, dirToCreate)) except OSError: fatalError = True write2FileStdout( fh, '** FAILURE ** Unable to Create Directory for Required Parameter \'%s\' at \'%s\'' % (currArg, dirToCreate)) else: fatalError = True write2FileStdout( fh, '** FAILURE ** File/Directory for Required Parameter \'%s\' at \'%s\' already exists! Should be created by TagGraph' % (currArg, dirToCreate)) else: fatalError = True write2FileStdout( fh, '** FAILURE ** Required TagGraph Parameter \'%s\' not found in config file' % (currArg)) ## Now Lets Handle the Special Cases ## ExperimentName must be a string experimentName = '' if not 'ExperimentName' in tagGraphSectionMap: fatalError = True write2FileStdout( fh, '** FAILURE ** Required TagGraph Parameter \'ExperimentName\' not found in config file' ) else: experimentName = tagGraphSectionMap['ExperimentName'] write2FileStdout( fh, '* Found Required TagGraph Parameter ExperimentName: \'%s\'' % (experimentName)) ## New Method: numFractions = 2, fraction01 = <path to file 1>, fraction02 = <path to file 2> numFractions = 0 foundNumFractions = False dataDirectory = '' symLinkDir = symLinkBaseDir if not 'numFractions' in tagGraphSectionMap: ## Check for dataDirectory and automatically finding data files from the de novo files if not 'dataDirectory' in tagGraphSectionMap: fatalError = True write2FileStdout( fh, '** FAILURE ** Required Directory TagGraph Parameter \'dataDirectory\' not found in config file' ) else: dataDirectory = tagGraphSectionMap['dataDirectory'] if not dataDirectory.endswith('/'): dataDirectory += '/' if (not (dataDirectory.startswith('/'))): levelup = dataDirectory.count('../') if (levelup == 0): dataDirectory = CUR_DIR + '/' + dataDirectory else: splitDataDir = dataDirectory.split("/") splitCurDir = CUR_DIR.split("/") tmpD = '' for i in xrange(0, len(splitCurDir) - levelup): tmpD = tmpD + splitCurDir[i] + "/" for i in xrange(levelup, len(splitDataDir) - 1): tmpD = tmpD + splitDataDir[i] + "/" dataDirectory = tmpD write2FileStdout(fh, "dataDirectory: %s" % dataDirectory) if not os.path.exists(dataDirectory): fatalError = True write2FileStdout( fh, '** FAILURE ** Required Directory TagGraph Parameter \'dataDirectory\' does not exist at: \'%s\'' % (dataDirectory)) elif not os.path.isdir(dataDirectory): fatalError = True write2FileStdout( fh, '** FAILURE ** Required Directory TagGraph Parameter \'dataDirectory\' does not point to a directory at: \'%s\'' % (dataDirectory)) else: ## We need to get the data file names 
from the de novo file, and check for them in the dataDirectory fileFractionMapping = [] deNovoFile = tagGraphSectionMap['de_novo'] if deNovoFile.upper().endswith('.XML') or deNovoFile.upper( ).endswith('.PEPXML') or deNovoFile.upper().endswith('.CSV'): if deNovoFile.upper().endswith( '.XML') or deNovoFile.upper().endswith('.PEPXML'): fileFractionMapping = pepInput.getFileFractionMappingFromPepXML( deNovoFile) else: ## deNovoFile.upper().endswith('.CSV'): fileFractionMapping = pepInput.getFileFractionMappingFromCSV( deNovoFile) ## We should now have fileMapping, a list of tuples: (2-Digit Fraction Num, FileName) ## mz[X]ML Files should be located in the dataDirectory write2FileStdout( fh, 'fileFractionMapping: %s' % fileFractionMapping) symLinkDir += experimentName + '_' + str(os.getpid()) + '/' dataFileSuffix = "mzML" try: ## Should be able to make the directory, and then remove it ## os.makedirs(symLinkDir) write2FileStdout( fh, '* Created temporary sym-link Directory for TagGraph mz[X]ML files \'%s\'' % (symLinkDir)) ## Lets write out the fileFractionMapping, pickled for easy reading/writing mappingFilename = 'fileFractionMapping.pck' mappingFilePath = os.path.join(symLinkDir, mappingFilename) mappingOutput = open(mappingFilePath, 'wb') pickle.dump(fileFractionMapping, mappingOutput) mappingOutput.close() ##Create a symbolic link pointing to source named link_name. for currFilledFractionNumber, currFilename in fileFractionMapping: ## Check if source file exists currFilePath = dataDirectory + currFilename if not os.path.exists(currFilePath): fatalError = True write2FileStdout( fh, '** FAILURE ** Data File \'%s\' referenced in de novo file does not exist in dataDirectory \'%s\'' % (currFilename, dataDirectory)) elif not os.access(currFilePath, os.R_OK): fatalError = True write2FileStdout( fh, '** FAILURE ** Data file \'%s\' Not Readable' % (currFilePath)) else: currFractionFile = currFilePath if currFractionFile.endswith('mzML'): dataFileSuffix = 'mzML' elif currFractionFile.endswith('mzXML'): dataFileSuffix = 'mzXML' else: fatalError = True dataFileSuffix = '' write2FileStdout( fh, '** FAILURE ** Data file \'%s\' must end in .mzML or .mzXML!' % (currFractionFile)) if not dataFileSuffix == '': symLinkFile = symLinkDir + experimentName + '_f' + currFilledFractionNumber + '.' 
+ dataFileSuffix os.symlink(currFractionFile, symLinkFile) write2FileStdout( fh, ' * Created symLink \'%s\' to data file \'%s\'' % (symLinkFile, currFractionFile)) except OSError: fatalError = True write2FileStdout( fh, '** FAILURE ** Unable to Create Directory for TagGraph mz[X]ML sym-links at \'%s\'' % (symLinkDir)) else: fatalError = True write2FileStdout( fh, '** FAILURE ** Required de novo TagGraph Parameter \'de_novo\' must be named .CSV or .XML/.PEPXML, found \'%s\'' % (deNovoFile)) else: numFractions = tagGraphSectionMap['numFractions'] if isNumeric(numFractions): if float(numFractions).is_integer(): foundNumFractions = True write2FileStdout( fh, '* Found Required integer TagGraph Parameter \'numFractions\' : \'%s\'' % (numFractions)) numFractions = int(numFractions) else: fatalError = True write2FileStdout( fh, '** FAILURE ** Required TagGraph Parameter \'numFractions\' must be an integer value, found value \'%s\'' % (numFractions)) else: fatalError = True write2FileStdout( fh, '** FAILURE ** Required TagGraph Parameter \'numFractions\' must be a numeric value, found value \'%s\'' % (numFractions)) ## If we found numFractions, lets get the paths to the data files and make sym-links to them in a new directory ## ## sym-links will be named <ExperimentName>_f01.mz[X]ML, etc. ## if True == foundNumFractions: symLinkDir += experimentName + '_' + str(os.getpid()) + '/' dataFileSuffix = "mzML" try: ## Should be able to make the directory, and then remove it ## os.makedirs(symLinkDir) write2FileStdout( fh, '* Created temporary sym-link Directory for TagGraph mz[X]ML files \'%s\'' % (symLinkDir)) except OSError: fatalError = True write2FileStdout( fh, '** FAILURE ** Unable to Create Directory for TagGraph mz[X]ML sym-links at \'%s\'' % (symLinkDir)) ##Create a symbolic link pointing to source named link_name. for currFraction in xrange(1, numFractions + 1): filledFractionNumber = str(currFraction).zfill(2) if not str('fraction' + filledFractionNumber) in tagGraphSectionMap: fatalError = True write2FileStdout( fh, '** FAILURE ** Required TagGraph Parameter \'fraction%s\' not found in config file' % (filledFractionNumber)) currFractionFile = tagGraphSectionMap['fraction' + filledFractionNumber] if currFractionFile.endswith('mzML'): dataFileSuffix = 'mzML' elif currFractionFile.endswith('mzXML'): dataFileSuffix = 'mzXML' else: fatalError = True write2FileStdout( fh, '** FAILURE ** Data file \'%s\' must end in mzML or mzXML!' % (currFractionFile)) symLinkFile = symLinkDir + experimentName + '_f' + filledFractionNumber + '.' 
+ dataFileSuffix os.symlink(currFractionFile, symLinkFile) write2FileStdout( fh, ' * Created symLink \'%s\' to data file \'%s\'' % (symLinkFile, currFractionFile)) # f must be an fmindex name of the form <basepath>.fm, where <basepath> is the full file path without the .fm extension, and the following files should exist: <basename>.fm.1, <basename>.seqnames.1, <basename>.offsets fmindexBase = '' if not 'fmindex' in tagGraphSectionMap: fatalError = True write2FileStdout( fh, '** FAILURE ** Required TagGraph Parameter \'fmindex\' (should be the basename of the fmindex files, ending in \'.fm\') not found in config file' ) else: fmParam = tagGraphSectionMap['fmindex'] write2FileStdout( fh, '* Found Required fmindex TagGraph Parameter \'%s\'' % (fmParam)) if fmParam.endswith('.fm'): fmindexBase = fmParam[:-3] else: fmindexBase = fmParam # Now lets check for 3 fmIndex files ending in: .fm.1, .offsets, and .seqnames.1 fmFile = fmindexBase + ".fm.1" fmOffsetFile = fmindexBase + ".offsets" fmSeqnamesFile = fmindexBase + ".seqnames.1" if not os.path.isfile(fmFile): fatalError = True write2FileStdout( fh, ' ** FAILURE ** Could not find required fmindex file at \'%s\'' % (fmFile)) elif not os.access(fmFile, os.R_OK): fatalError = True write2FileStdout( fh, ' ** FAILURE ** Could not Read required fmindex file \'%s\' (check permissions)' % (fmFile)) else: write2FileStdout( fh, ' * Found Required readable fmindex file at \'%s\'' % (fmFile)) if not os.path.isfile(fmOffsetFile): fatalError = True write2FileStdout( fh, ' ** FAILURE ** Could not find required fmindex Offset file at \'%s\'' % (fmOffsetFile)) elif not os.access(fmOffsetFile, os.R_OK): fatalError = True write2FileStdout( fh, ' ** FAILURE ** Could not Read required fmindex Offset file \'%s\' (check permissions)' % (fmOffsetFile)) else: write2FileStdout( fh, ' * Found Required readable fmindex Offset file at \'%s\'' % (fmOffsetFile)) if not os.path.isfile(fmSeqnamesFile): fatalError = True write2FileStdout( fh, ' ** FAILURE ** Could not find required fmindex Seqnames file at \'%s\'' % (fmSeqnamesFile)) elif not os.access(fmSeqnamesFile, os.R_OK): fatalError = True write2FileStdout( fh, ' ** FAILURE ** Could not Read required fmindex Seqnames file \'%s\' (check permissions)' % (fmSeqnamesFile)) else: write2FileStdout( fh, ' * Found Required readable fmindex Seqnames file at \'%s\'' % (fmSeqnamesFile)) ### Now lets Check the EM step parameters that can be checked before TG runs ### expectationMaximizationSectionMap = ConfigSectionMap(theConfig, "EM") ''' -i: same as TG -i parameter -F all -M 100 -C 20 -B = <-o parameter from TG>/results.db [checked after TG] -E: Same as TG ExperimentName parameter. -o: Output Prefix, will create files with the prefix <EM -o parameter> in the directory specified by the <TG -o parameter> ''' ## Arguments that must exist, and be numbers # Special Case: EMFractions must be 'all' or a number. 
Note: EMFractions is now assumed to always be 'all' requiredEMNumericArgs = ['maxIterations', 'initIterations'] #,'EMFractions'] ## Special Arguments: ## -o must be a string, the file prefix for the EM Output files (often 'EM_Results') ## Arguments that must exist, and be numbers ('EMFractions' is special, as a number or 'all') for currArg in requiredEMNumericArgs: if currArg in expectationMaximizationSectionMap: if isNumeric(expectationMaximizationSectionMap[currArg]): write2FileStdout( fh, '* Found Required EM Numeric Parameter \'%s\' : \'%s\'' % (currArg, expectationMaximizationSectionMap[currArg])) else: fatalError = True write2FileStdout( fh, '** FAILURE ** Required EM Parameter \'%s\' must be a numeric value, found value \'%s\'' % (currArg, expectationMaximizationSectionMap[currArg])) else: fatalError = True write2FileStdout( fh, '** FAILURE ** Required EM Parameter \'%s\' not found in config file' % (currArg)) ## Now Lets Handle the Special Cases # resultsPrefix (Output Prefix) must be a string emResultsPrefix = '' if not 'resultsPrefix' in expectationMaximizationSectionMap: fatalError = True write2FileStdout( fh, '** FAILURE ** Required EM Parameter \'resultsPrefix\' not found in config file' ) else: emResultsPrefix = expectationMaximizationSectionMap['resultsPrefix'] write2FileStdout( fh, '* Found Required EM Parameter \'resultsPrefix\': \'%s\'' % (emResultsPrefix)) #options = ArgLib.parse(['init', 'dtadir', 'peaks', 'output', 'ppmstd', 'modtolerance', 'unimoddict', 'maxcounts', 'modmaxcounts', 'fmindex', 'model', 'config'], optArgs=[{'opts': ('-x', '--splittaxon'), 'attrs': {'dest': 'splittaxon', 'action': 'store_true', 'default': False, 'help': 'Flag. For searches of metaproteomic databases, split identical context entries by taxon for accurate consideration via EM.'}}]) ### If a fatal error was thrown, do not proceed ### if fatalError == True: write2FileStdout( fh, '***** HALTING DUE TO FATAL ERROR IN TAGGRAPH OR EM PARAMETERS, SEE OUTPUT ABOVE!!! 
' ) sys.exit(1) ## Lets set up the args properly for RUN_TAGGRAPH_HUMAN_PROTEOME_EASY.py ## tg_ppmstd = str(tagGraphSectionMap['ppmstd']) tg_modtolerance = str(tagGraphSectionMap['modtolerance']) tg_maxcounts = str(tagGraphSectionMap['maxcounts']) tg_modmaxcounts = str(tagGraphSectionMap['modmaxcounts']) tg_config = tagGraphSectionMap['config'] tg_init = tagGraphSectionMap['init'] tg_dtadir = symLinkDir ## tagGraphSectionMap['d'] tg_model = tagGraphSectionMap['model'] tg_output = tagGraphSectionMap['output'] tg_unimoddict = tagGraphSectionMap['unimoddict'] tg_fmindex = tagGraphSectionMap['fmindex'] tg_peaks = '{\'' + tagGraphSectionMap[ 'ExperimentName'] + '\': \'' + tagGraphSectionMap[ 'de_novo'] + '\'}' # K = "{'e009133': '/lab/samba/shared/Users/Sam/20160630_Pulldown_dcas9_in_gel_digest_test_DENOVO_5/de_novo_peptides.csv'}" ### tg_output directory will now end with a slash if not tg_output.endswith('/'): tg_output += '/' tgArgs = [] tgArgs.extend(['-p', '\"' + tg_ppmstd + '\"']) tgArgs.extend(['-l', '\"' + tg_modtolerance + '\"']) tgArgs.extend(['-M', '\"' + tg_maxcounts + '\"']) tgArgs.extend(['-C', '\"' + tg_modmaxcounts + '\"']) tgArgs.extend(['-c', '\"' + tg_config + '\"']) tgArgs.extend(['-i', '\"' + tg_init + '\"']) tgArgs.extend(['-d', '\"' + tg_dtadir + '\"']) tgArgs.extend(['-m', '\"' + tg_model + '\"']) tgArgs.extend(['-o', '\"' + tg_output + '\"']) tgArgs.extend(['-Q', '\"' + tg_unimoddict + '\"']) tgArgs.extend(['-f', '\"' + tg_fmindex + '\"']) tgArgs.extend(['-K', '\"' + tg_peaks + '\"']) write2FileStdout(fh, '\nTG ARGS: %s\n\n' % tgArgs) write2FileStdout(fh, MSGBORDER) write2FileStdout( fh, '*** CALLING RUN_TAGGRAPH_HUMAN_PROTEOME_EASY.py from runTG.py') write2FileStdout(fh, MSGBORDER + "\n") DataFile.executeProcess(SCRIPTS_DIR, 'RUN_TAGGRAPH_HUMAN_PROTEOME_EASY.py', tgArgs) write2FileStdout(fh, "\n" + MSGBORDER) write2FileStdout( fh, '*** END CALLING RUN_TAGGRAPH_HUMAN_PROTEOME_EASY.py from runTG.py') write2FileStdout(fh, MSGBORDER) ### VERIFY TG RUN ### ''' Now lets check the TG output to make sure it ran correctly. We'll check for: * <output_dir>/results.db should exist and have size > 0 (do actual db check?) * The files <output_dir>/<experiment_name>_addPlausibleMods_poss_[combo/single]_mods.tdv both exist and have reasonable sizes * Check that output_dir/<experiment_name>/data/ contains directories of DTA files named <experiment_name>_f01/ etc * Check that output_dir/<experiment_name>/de_novo/<experiment_name>_PEAKS.csv/PEAKS_parsed.tdv/PEAKS_parsed_F1.tdv etc exist * Check that output_dir/<experiment_name>/taggraph/<experiment_name>_PEAKS_parsed_F1_TAGGRAPH.tdv etc exist * output_dir/<experiment_name>_CHECK.txt.<numFractions> contains count numbers for each fraction: ------------------------------- /lab/samba/shared/Users/Sam/newtest/diet60_output Experiment Name diet60 ID 1 Result Counts for 4 fractions F1: 399878 F2: 395964 F3: 346932 F4: 270693 ------------------------------- ''' write2FileStdout(fh, MSGBORDER) write2FileStdout(fh, '*** VERIFYING TAGGRAPH OUTPUTS in runTG.py ') write2FileStdout(fh, MSGBORDER) minDBFileSize = 1000000 ## 1Megabyte minimum db size after TG runs? minAddPlausibleModsFileSize = 2000 ## 10kBytes min size for <experiment_name>_addPlausibleMods_[combo/single]_mods.tdv files ## <output_dir>/results.db should exist and have size > 0 (do actual db check?) dbFile = tg_output + 'results.db' if not os.path.exists(dbFile): fatalError = True write2FileStdout( fh, '** FAILURE ** Required SQLITE DB File \'%s\' does not exist!!' 
% (dbFile)) else: dbFileSize = os.path.getsize(dbFile) if dbFileSize < minDBFileSize: fatalError = True write2FileStdout( fh, '** FAILURE ** Required SQLITE DB File \'%s\' is too small: %d Bytes!!' % (dbFile, dbFileSize)) else: write2FileStdout( fh, '* Found Required SQLITE DB File \'%s\', size %d Bytes OK' % (dbFile, dbFileSize)) ## The files <output_dir>/<experiment_name>_addPlausibleMods_poss_[combo/single]_mods.tdv both exist singleModsFile = tg_output + experimentName + '_addPlausibleMods_poss_single_mods.tdv' comboModsFile = tg_output + experimentName + '_addPlausibleMods_poss_combo_mods.tdv' if not os.path.exists(singleModsFile): fatalError = True write2FileStdout( fh, '** FAILURE ** Required Single Mods File \'%s\' does not exist!!' % (singleModsFile)) else: singleModsFileSize = os.path.getsize(singleModsFile) if singleModsFileSize < minAddPlausibleModsFileSize: fatalError = True write2FileStdout( fh, '** FAILURE ** Required Single Mods File \'%s\' is too small: %d Bytes!!' % (singleModsFile, singleModsFileSize)) else: write2FileStdout( fh, '* Found Required Single Mods File \'%s\', size %d Bytes OK' % (singleModsFile, singleModsFileSize)) if not os.path.exists(comboModsFile): fatalError = True write2FileStdout( fh, '** FAILURE ** Required Combo Mods File \'%s\' does not exist!!' % (comboModsFile)) else: comboModsFileSize = os.path.getsize(comboModsFile) if comboModsFileSize < minAddPlausibleModsFileSize: fatalError = True write2FileStdout( fh, '** FAILURE ** Required Combo Mods File \'%s\' is too small: %d Bytes!!' % (comboModsFile, comboModsFileSize)) else: write2FileStdout( fh, '* Found Required Combo Mods File \'%s\', size %d Bytes OK' % (comboModsFile, comboModsFileSize)) ## Check that output_dir/<experiment_name>/data/ contains directories of DTA files named <experiment_name>_f01/ etc dataDir = tg_output + experimentName + '/data/' for currFraction in xrange(1, numFractions + 1): filledFractionNumber = str(currFraction).zfill(2) currDtaDirName = dataDir + experimentName + '_f' + filledFractionNumber if not os.path.exists(currDtaDirName): fatalError = True write2FileStdout( fh, '** FAILURE ** Missing directory of DTA files at: \'%s\'' % (currDtaDirName)) elif not os.path.isdir(currDtaDirName): fatalError = True write2FileStdout( fh, '** FAILURE ** \'%s\' exists but is not a Directory!' % (currDtaDirName)) else: write2FileStdout( fh, '* Found DTA directory: \'%s\'' % (currDtaDirName)) ## Check that output_dir/<experiment_name>/de_novo/<experiment_name>_PEAKS.csv/PEAKS_parsed.tdv/PEAKS_parsed_F1.tdv etc exist deNovoDir = tg_output + experimentName + '/de_novo/' deNovoCSV = deNovoDir + experimentName + '_PEAKS.csv' peaksParsed = deNovoDir + experimentName + '_PEAKS_parsed.tdv' fractionsParsedBase = deNovoDir + experimentName + '_PEAKS_parsed_F' if not os.path.exists(deNovoCSV): fatalError = True write2FileStdout( fh, '** FAILURE ** Missing de novo CSV File \'%s\' !!' % (deNovoCSV)) else: write2FileStdout( fh, '* Found Required de novo CSV File \'%s\'' % (deNovoCSV)) if not os.path.exists(peaksParsed): fatalError = True write2FileStdout( fh, '** FAILURE ** Missing Parsed de novo File \'%s\' !!' % (peaksParsed)) else: write2FileStdout( fh, '* Found Required Parsed de novo File \'%s\'' % (peaksParsed)) for currFraction in xrange(1, numFractions + 1): currParsedFractionFile = fractionsParsedBase + str( currFraction) + '.tdv' if not os.path.exists(currParsedFractionFile): fatalError = True write2FileStdout( fh, '** FAILURE ** Missing Parsed de novo Fraction File \'%s\' !!' 
% (currParsedFractionFile)) else: write2FileStdout( fh, '* Found Required Parsed de novo Fraction File \'%s\'' % (currParsedFractionFile)) ## Check that output_dir/<experiment_name>/taggraph/<experiment_name>_PEAKS_parsed_F1_TAGGRAPH.tdv etc exist taggraphDir = tg_output + experimentName + '/taggraph/' taggraphParsedBase = taggraphDir + experimentName + '_PEAKS_parsed_F' taggraphParsedSuffix = '_TAGGRAPH.tdv' for currFraction in xrange(1, numFractions + 1): currTaggraphFractionFile = taggraphParsedBase + str( currFraction) + taggraphParsedSuffix if not os.path.exists(currTaggraphFractionFile): fatalError = True write2FileStdout( fh, '** FAILURE ** Missing Parsed TagGraph Fraction File \'%s\' !!' % (currTaggraphFractionFile)) else: write2FileStdout( fh, '* Found Required Parsed TagGraph Fraction File \'%s\'' % (currTaggraphFractionFile)) write2FileStdout(fh, "\n" + MSGBORDER) write2FileStdout(fh, '*** END VERIFYING TAGGRAPH OUTPUTS in runTG.py') write2FileStdout(fh, MSGBORDER) ### END VERIFY TG RUN ### ### If a fatal error was thrown, do not proceed ### if fatalError == True: write2FileStdout( fh, '***** HALTING DUE TO FATAL ERROR IN VERIFYING TAGGRAPH RUN, SEE OUTPUT ABOVE!!' ) sys.exit(1) ## Copy configuration file to output tree for safe keeping ## configFileBaseName = os.path.basename(configFileName) checkConfigDestination = tg_output if os.path.exists(checkConfigDestination + configFileBaseName): write2FileStdout( fh, '** WARNING ** config file \'%s\' already exists in output directory \'%s\'' % (configFileBaseName, checkConfigDestination)) else: shutil.copy(configFileName, checkConfigDestination) write2FileStdout( fh, '* Successfully copied Configuration File \'%s\' to Output Directory \'%s\'' % (configFileName, checkConfigDestination)) ## Lets set up the args properly for ComputeEMProbabilitiesFromDB.py ## ''' -i: same as TG -i parameter -F all -M 100 -C 20 -B = <-o parameter from TG>/results.db [checked after TG runs] -E: Same as TG ExperimentName parameter. -o: Output Prefix, will create files with the prefix <EM -o parameter> in the directory specified by the <TG -o parameter> ''' em_init = tg_init em_fractions = 'all' ## EMFractions is always 'all' now! 
## = str(expectationMaximizationSectionMap['EMFractions']) em_maxIterations = str(expectationMaximizationSectionMap['maxIterations']) em_initIterations = str( expectationMaximizationSectionMap['initIterations']) em_dbLocation = tg_output + 'results.db' em_experimentName = tagGraphSectionMap['ExperimentName'] em_output = tg_output if not em_output.endswith('/'): em_output += '/' em_output += emResultsPrefix emArgs = [] emArgs.extend(['-i', '\"' + em_init + '\"']) emArgs.extend(['-F', '\"' + em_fractions + '\"']) emArgs.extend(['-M', '\"' + em_maxIterations + '\"']) emArgs.extend(['-C', '\"' + em_initIterations + '\"']) emArgs.extend(['-B', '\"' + em_dbLocation + '\"']) emArgs.extend(['-E', '\"' + em_experimentName + '\"']) emArgs.extend(['-o', '\"' + em_output + '\"']) write2FileStdout(fh, 'EM ARGS: %s\n' % emArgs) write2FileStdout(fh, MSGBORDER) write2FileStdout( fh, '*** CALLING ComputeEMProbabilitiesFromDB.py from runTG.py') write2FileStdout(fh, MSGBORDER + "\n") DataFile.executeProcess(SCRIPTS_DIR, 'ComputeEMProbabilitiesFromDB.py', emArgs) write2FileStdout( fh, '*** command executed: python ComputeEMProbabilitiesFromDB.py %s' % emArgs) write2FileStdout(fh, "\n" + MSGBORDER) write2FileStdout( fh, '*** END CALLING ComputeEMProbabilitiesFromDB.py from runTG.py') write2FileStdout(fh, MSGBORDER) EMProbs_TOPONLY = tg_output + 'EM_Results_EMProbs_END_TOPONLY.tdv' if not os.path.exists(EMProbs_TOPONLY): fatalError = True write2FileStdout( fh, '** FAILURE ** Missing EMProbs END TOPONLY file \'%s\'.' % (EMProbs_TOPONLY)) sys.exit(1) else: write2FileStdout( fh, '* Found EMProbs END TOPONLY file \'%s\'' % (EMProbs_TOPONLY)) write2FileStdout(fh, "\n\n" + MSGBORDER) write2FileStdout(fh, '*** CALLING verify EM result tests from runTG.py') write2FileStdout(fh, "\ntime now: @ %s" % datetime.datetime.now()) result = verifyEM.verifyEM(tg_output) write2FileStdout(fh, result) write2FileStdout(fh, MSGBORDER) write2FileStdout(fh, "\ntime now: @ %s" % datetime.datetime.now()) write2FileStdout(fh, '*** END CALLING verify EM result tests from runTG.py') write2FileStdout(fh, MSGBORDER) topResultsFile = tg_output + experimentName + '_TopResults.tdv' if not os.path.exists(topResultsFile): fatalError = True write2FileStdout( fh, '** FAILURE ** Missing TopResult file \'%s\'.' 
% (topResultsFile)) sys.exit(1) else: write2FileStdout(fh, '* Found TopResult file \'%s\'' % (topResultsFile)) outputPerFraction = "No" write2FileStdout( fh, '**** start parseResultsDB process: %s' % (datetime.datetime.now())) FDRCutoff = 0.01 logEMCutoff = 100 DisplayProteinNum = 5 if "outputPerFraction" in generalSectionMap: if True == theConfig.getboolean('General', 'outputPerFraction'): outputPerFraction = "Yes" if "FDRCutoff" in generalSectionMap: if isNumeric(generalSectionMap["FDRCutoff"]): write2FileStdout( fh, '* Found Numeric TagGraph Parameter \'%s\' : \'%s\'' % ("FDRCutoff", generalSectionMap["FDRCutoff"])) FDRCutoff = generalSectionMap['FDRCutoff'] if "logEMCutoff" in generalSectionMap: if isNumeric(generalSectionMap["logEMCutoff"]): write2FileStdout( fh, '* Found Numeric TagGraph Parameter \'%s\' : \'%s\'' % ("logEMCutoff", generalSectionMap["logEMCutoff"])) logEMCutoff = generalSectionMap['logEMCutoff'] if "DisplayProteinNum" in generalSectionMap: if isNumeric(generalSectionMap["DisplayProteinNum"]): write2FileStdout( fh, '* Found Numeric TagGraph Parameter \'%s\' : \'%s\'' % ("DisplayProteinNum", generalSectionMap["DisplayProteinNum"])) DisplayProteinNum = generalSectionMap['DisplayProteinNum'] writeTopArgs = [] writeTopArgs.extend(['\"' + tg_output + '\"']) writeTopArgs.extend(['\"' + tg_init + '\"']) writeTopArgs.extend(['\"' + outputPerFraction + '\"']) writeTopArgs.extend(['\"' + str(FDRCutoff) + '\"']) writeTopArgs.extend(['\"' + str(logEMCutoff) + '\"']) writeTopArgs.extend(['\"' + str(DisplayProteinNum) + '\"']) ## Now lets parse the original TG tab-delimted format ## write2FileStdout(fh, MSGBORDER) write2FileStdout(fh, '*** CALLING parseResultsDB.py from runTG.py') write2FileStdout(fh, MSGBORDER + "\n") DataFile.executeProcess(SCRIPTS_DIR, 'parseResultsDB.py', writeTopArgs) write2FileStdout( fh, '*** command executed: python parseResultsDB.py %s' % writeTopArgs) write2FileStdout(fh, "\n" + MSGBORDER) write2FileStdout(fh, '*** END CALLING parseResultsDB.py from runTG.py') write2FileStdout( fh, '**** done parseResultsDB process: %s' % (datetime.datetime.now())) write2FileStdout(fh, MSGBORDER) topResultsFinalFile = tg_output + experimentName + '_TopResults*.txt' foundFile = 0 if len(glob.glob(topResultsFinalFile)) > 0: foundFile = 1 if foundFile == 0: fatalError = True write2FileStdout( fh, '** FAILURE ** Missing result file \'%s\' from parseResultsDB.py process. Please check.' 
% (topResultsFinalFile)) sys.exit(1) if 'generatePepXML' in generalSectionMap: if True == theConfig.getboolean('General', 'generatePepXML'): ## Now lets generate the output in PepXML format ## ''' python -u /lab/scratch/taggraph_sarah/taggraphsourcecode/database/resultPepXML.py \ tg_init-i /lab/scratch/taggraph_sarah/taggraphsourcecode/resources/TAG_GRAPH_Tryp_CysCarbam_MetOx.ini \ tg_ppmstd-p 10 \ tg_modtolerance-l 0.1 \ tg_maxcounts-M 400 \ tg_modmaxcounts-C 200 \ tg_fmindex-f /var/www/html/TAG_GRAPH/lib/databases/20141209_UniHUMAN_cRAP_ILEq.fm \ tg_model-m /lab/scratch/taggraph_sarah/taggraphsourcecode/resources/AllChargeDist_posOnlyDependence_20150808_HumanProt500000.pck \ xxxx-c /lab/scratch/taggraph_sarah/taggraphsourcecode/resources/AllChargeDist_posOnlyDependence_20150808.txt \ tg_unimoddict-Q /lab/scratch/taggraph_sarah/taggraphsourcecode/resources/unimodDict_noLabels.pck \ tg_output-o /lab/samba/shared/Users/Sarah/taggraph/testmzml/output/ \ tg_dtadir-d /lab/samba/shared/Users/Sarah/taggraph/testmzml \ >& /lab/samba/shared/Users/Sarah/taggraph/testmzml/OutputpepXML.txt ''' pepArgs = [] pepArgs.extend(['\"' + tg_init + '\"']) pepArgs.extend(['\"' + tg_ppmstd + '\"']) pepArgs.extend(['\"' + tg_modtolerance + '\"']) pepArgs.extend(['\"' + tg_maxcounts + '\"']) pepArgs.extend(['\"' + tg_modmaxcounts + '\"']) pepArgs.extend(['\"' + tg_fmindex + '\"' ]) # tagGraphSectionMap['fmindex'] pepArgs.extend(['\"' + tg_model + '\"' ]) # tagGraphSectionMap['model'] #pepArgs.extend(['\"' + tg_config + '\"']) # tagGraphSectionMap['config'] pepArgs.extend(['\"' + tg_unimoddict + '\"' ]) # tagGraphSectionMap['unimoddict'] pepArgs.extend(['\"' + tg_output + '\"' ]) # tagGraphSectionMap['output'] pepArgs.extend(['\"' + tg_dtadir + '\"']) # symLinkDir pepArgs.extend(['\"' + str(FDRCutoff) + '\"']) pepArgs.extend(['\"' + str(logEMCutoff) + '\"']) write2FileStdout(fh, MSGBORDER) write2FileStdout( fh, '*** CALLING generatePepXMLDBperFrac.py from runTG.py') write2FileStdout(fh, MSGBORDER + "\n") DataFile.executeProcess(SCRIPTS_DIR, 'generatePepXMLDBperFrac.py', pepArgs) write2FileStdout( fh, '*** command: python generatePepXMLDBperFrac.py %s' % pepArgs) write2FileStdout(fh, "\n" + MSGBORDER) write2FileStdout( fh, '*** END CALLING generatePepXMLDBperFrac.py from runTG.py') write2FileStdout(fh, MSGBORDER) ''' Now lets clean up our temporary items and copied data files as configured! 
### We need to: * Remove the sym-link directory in /tmp/ (symLinkDir) * If cleanMzDataFilesFromOutput is True, clean the dataDir (<output_dir>/<experiment_name>/data/) directory of mz[X]ML files and the DTA directories of the same name ''' write2FileStdout(fh, MSGBORDER) write2FileStdout(fh, '*** CLEANING UP') write2FileStdout(fh, MSGBORDER) ### Remove the sym-link directory in /tmp/ (symLinkDir) shutil.rmtree(symLinkDir) if os.path.exists(symLinkDir): write2FileStdout( fh, '** FAILURE ** Failed to removed temporary symbolic link directory \'%s\'' % (symLinkDir)) else: write2FileStdout( fh, '* Successfully removed temporary symbolic link directory \'%s\'' % (symLinkDir)) if 'cleanInputDataFilesFromOutput' in generalSectionMap: if True == theConfig.getboolean('General', 'cleanInputDataFilesFromOutput'): shutil.rmtree(dataDir) #os.makedirs(dataDir) write2FileStdout( fh, '* Removed mz[X]ML and DTA files from data directory \'%s\' (cleanInputDataFilesFromOuput is True)' % (dataDir)) else: write2FileStdout( fh, '* Leaving mz[X]ML and DTA files in data directory \'%s\' (cleanInputDataFilesFromOuput is False)' % (dataDir)) if 'cleanIntermediateFiles' in generalSectionMap: denovoOutputDir = tg_output + '/' + experimentName + '/de_novo/' taggraphOutputDir = tg_output + '/' + experimentName + '/taggraph/' experimentOutputDir = tg_output + '/' + experimentName if True == theConfig.getboolean('General', 'cleanIntermediateFiles'): shutil.rmtree(denovoOutputDir) shutil.rmtree(taggraphOutputDir) if os.path.exists(dataDir): shutil.rmtree(dataDir) shutil.rmtree(experimentOutputDir) files = os.listdir(tg_output) for file in files: if (file.endswith(".tdv") or (file.find("_CHECK.txt.") > 0) or file.endswith(".db") or file.endswith(".log")): if (os.path.exists(os.path.join(tg_output, file))): write2FileStdout( fh, "remove %s" % os.path.join(tg_output, file)) os.remove(os.path.join(tg_output, file)) else: write2FileStdout( fh, "keeper %s" % os.path.join(tg_output, file)) write2FileStdout( fh, '* Removed mz[X]ML and Intermediate files from output directory \'%s\' (cleanIntermediateFiles is True)' % (dataDir)) else: write2FileStdout( fh, '* Leaving mz[X]ML and Intermediate files in output directory \'%s\' (cleanIntermediateFiles is False)' % (dataDir)) write2FileStdout(fh, MSGBORDER) write2FileStdout(fh, '*** END CLEANING UP') write2FileStdout(fh, MSGBORDER) write2FileStdout(fh, '%s' % TAGGRAPH_CONFIG_FOOTER) write2FileStdout( fh, '**** end TagGraph process: %s' % (datetime.datetime.now())) fh.close() #move file back to output folder: toDest = tg_output + "runReport.log" shutil.move(runCapture, toDest) sys.exit(0)
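# Hedged sketch of the fraction sym-linking that runTG.py performs in both the dataDirectory
# and numFractions branches above: each data file is linked into the temporary directory as
# <ExperimentName>_fNN.mzML or .mzXML. The helper name and standalone form are assumptions;
# the real script interleaves these steps with its config checks and failure logging.
import os

def makeFractionSymlinks(symLinkDir, experimentName, fractionFiles):
    """Create <experimentName>_f01.mz[X]ML-style links in symLinkDir for each source file."""
    os.makedirs(symLinkDir)
    for fractionNumber, sourceFile in enumerate(fractionFiles, start=1):
        suffix = 'mzML' if sourceFile.endswith('mzML') else 'mzXML'
        linkName = os.path.join(symLinkDir, '%s_f%s.%s' % (experimentName, str(fractionNumber).zfill(2), suffix))
        os.symlink(sourceFile, linkName)
        print('Created symLink %s -> %s' % (linkName, sourceFile))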
from DataFile import *

file = DataFile("testCsv.csv")
print(str(file.getRowCout()))
# file.appendEntry("g;Lukas;Bern")
print(str(file.getRowCout()))
file.removeEntryByColumnAndIdentifier(0, 'g')
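# The DataFile class exercised above is not shown in this excerpt; below is a minimal assumed
# sketch of the interface it appears to expose (semicolon-delimited rows, a row count, append,
# and removal by matching a column value). Method names mirror the calls above, but the
# behaviour is guessed for illustration only and nothing is written back to disk.
class SimpleCsvFile(object):
    def __init__(self, path, delimiter=';'):
        self.path = path
        self.delimiter = delimiter
        with open(path) as fin:
            self.rows = [line.rstrip('\n').split(delimiter) for line in fin if line.strip()]

    def getRowCout(self):
        # Name kept as used above; returns the number of parsed rows.
        return len(self.rows)

    def appendEntry(self, line):
        self.rows.append(line.split(self.delimiter))

    def removeEntryByColumnAndIdentifier(self, column, identifier):
        self.rows = [row for row in self.rows if row[column] != identifier]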
def getSpectrumAndPSMFeatureDict(LADSSeqInfo, seqEntry, scanFDict, pairConfig, PNet): featureList = [] lightScans = seqEntry[0] heavyScans = seqEntry[1] lightSpecs = [DataFile.getMassIntPairs(scanFDict[int(lightScanF)]['dta']) for lightScanF in lightScans] heavySpecs = [DataFile.getMassIntPairs(scanFDict[int(heavyScanF)]['dta']) for heavyScanF in heavyScans] avgLightPrecMass = np.average(np.array([scanFDict[lightScanF]['precMass'] for lightScanF in lightScans])) epSTD = options.ppmstd * 10**-6 * avgLightPrecMass specs = [] for i, massIntPairs in enumerate(lightSpecs): specs += [PN.Spectrum(PNet, scanFDict[lightScans[i]]['precMass'], Nmod=0.0, Cmod=0.0, epsilon=2*epSTD, spectrum=massIntPairs)] for i, massIntPairs in enumerate(heavySpecs): specs += [PN.Spectrum(PNet, scanFDict[heavyScans[i]]['precMass'], Nmod=pairConfig['NMod'], Cmod=pairConfig['CMod'], epsilon=2*epSTD, spectrum=massIntPairs)] for spec in specs: spec.initializeNoiseModel() clusterPairingStats = Discriminator.getClusterPairingStats(lightSpecs, heavySpecs, avgLightPrecMass, pairConfig, epSTD=epSTD) GLFD.addClusterPairingStatsToFeatureList(clusterPairingStats, featureList) scoreStats = {} truePMs = {} prmLadders = {} for PSM in LADSSeqInfo[seqEntry]: lightSeq = An.preprocessSequence(PSM[1], seqMap, ambigEdges=PSM[2]) scoreStats[PSM[:2]] = Discriminator.getScoreStats(specs, lightSeq, ambigEdges=PSM[2]) prmLadderWithEnds = An.getPRMLadder(lightSeq, ambigEdges=PSM[2], addEnds=True) truePMs[PSM[:2]] = prmLadderWithEnds[-1] prmLadders[PSM[:2]] = prmLadderWithEnds[1:-1] PSMList = scoreStats.keys() spectrumOrderedScoreStats, clusterScoreStats = GLFD.compileScoreStats(scoreStats, specs, PSMList) spectrumAndPSMSpecificFeatureDict = {} PSMIndexDict = dict([(PSM, i) for i, PSM in enumerate(PSMList)]) for i, PSM in enumerate(LADSSeqInfo[seqEntry]): PSMSpecificFeatureList = copy.copy(featureList) peptLength = len(prmLadders[PSM[:2]]) + 1 # Add LADS PScore (and normalized variants) and delta rank, delta score (LADS PScore) to feature list PSMSpecificFeatureList += [PSM[0], PSM[0]/peptLength, PSM[0]/len(specs), -i, PSM[0]-LADSSeqInfo[seqEntry][0][0]] # Add Total Path Score (and normalized variants) and delta rank, delta score (total path score) and total minimum node score to feature list totalPathScore = scoreStats[PSM[:2]]['Total Path Score'] PSMSpecificFeatureList += [totalPathScore, totalPathScore/peptLength, totalPathScore/len(specs), -clusterScoreStats['PSM Rankings'][PSMIndexDict[PSM[:2]]], totalPathScore-clusterScoreStats['Max Cluster Path Score'], scoreStats[PSM[:2]]['Total Minimum Node Score']] # Add minimum path score, maximum path score, (and normalized variants) and minimum score/maximum score for cluster to feature list PSMSpecificFeatureList += [scoreStats[PSM[:2]]['Minimum Path Score'], scoreStats[PSM[:2]]['Minimum Path Score']/peptLength, scoreStats[PSM[:2]]['Maximum Path Score'], scoreStats[PSM[:2]]['Maximum Path Score']/peptLength, scoreStats[PSM[:2]]['Minimum Path Score']/scoreStats[PSM[:2]]['Maximum Path Score']] # Add difference between minimum and maximum ranking for PSM across cluster to feature list rankingsForPSM = [spectrumOrderedScoreStats[i]['PSM Rankings'][PSMIndexDict[PSM[:2]]] for i in spectrumOrderedScoreStats] PSMSpecificFeatureList += [min(rankingsForPSM) - max(rankingsForPSM)] #Add Number forbidden node pairs (and normalized variants) to feature list numForbiddenPairs = Discriminator.getNumForbiddenPairs(prmLadders[PSM[:2]], avgLightPrecMass) PSMSpecificFeatureList += [numForbiddenPairs, 
2.0*numForbiddenPairs/(peptLength-1)] # Add number of ambiguous edges to feature list PSMSpecificFeatureList += [len(PSM[2])] # Add stats for PRM Evidence over cluster (and normalized variants) to feature list PSMSpecificFeatureList += [scoreStats[PSM[:2]]['Aggregate PRM Score Statistics']['All Evidence'], scoreStats[PSM[:2]]['Aggregate PRM Score Statistics']['All Evidence']/float(peptLength-1), scoreStats[PSM[:2]]['Aggregate PRM Score Statistics']['Majority Evidence'], scoreStats[PSM[:2]]['Aggregate PRM Score Statistics']['Majority Evidence']/float(peptLength-1), scoreStats[PSM[:2]]['Aggregate PRM Score Statistics']['None Evidence'], scoreStats[PSM[:2]]['Aggregate PRM Score Statistics']['None Evidence']/float(peptLength-1)] # Add stats for paired PRMs and their corresponding ion types to feature list pairedPRMStats = Discriminator.getPairedPRMStats(prmLadders[PSM[:2]], clusterPairingStats['Light Merged Spec'], clusterPairingStats['Heavy Merged Spec'], lightSpecs, heavySpecs, clusterPairingStats['Cluster Paired PRM Information'], epSTD=epSTD) GLFD.addPairedPRMStatsToFeatureList(pairedPRMStats, PSMSpecificFeatureList, len(prmLadders[PSM[:2]])) pairedPRMLadder = pairedPRMStats['Paired PRM Ladder'] for i, scan in enumerate(lightScans): spectrumSpecificFeatureList = copy.copy(PSMSpecificFeatureList) # Add path score (and normalized variants), delta rank, delta score, number of negative PRMs, and minimum node score for spectrum to feature list pathScore = spectrumOrderedScoreStats[i]['Path Scores'][PSMIndexDict[PSM[:2]]] numNegativePRMs = spectrumOrderedScoreStats[i]['Num Negative PRMs'][PSMIndexDict[PSM[:2]]] spectrumSpecificFeatureList += [pathScore, pathScore/peptLength, pathScore/scoreStats[PSM[:2]]['Maximum Path Score'], -spectrumOrderedScoreStats[i]['PSM Rankings'][PSMIndexDict[PSM[:2]]], spectrumOrderedScoreStats[i]['Delta Scores'][PSMIndexDict[PSM[:2]]], numNegativePRMs, numNegativePRMs/float(peptLength-1), spectrumOrderedScoreStats[i]['Min Node Scores'][PSMIndexDict[PSM[:2]]]] # Add mass deviation from true peptide mass to feature list precMass = scanFDict[scan]['precMass'] spectrumSpecificFeatureList += [abs(truePMs[PSM[:2]] + Constants.mods['H2O'] + Constants.mods['H+'] - precMass)] peakAnnotationMassOffsetStats = Discriminator.getPeakAnnotationAndMassOffsetStats(DataFile.getMassIntPairs(scanFDict[scan]['dta']), specs[i], prmLadders[PSM[:2]], pairedPRMLadder, PNet) GLFD.addPeakAnnotationStatsToFeatureList(PNet, peakAnnotationMassOffsetStats, spectrumSpecificFeatureList, peptLength) GLFD.addMassOffsetStatsToFeatureList(peakAnnotationMassOffsetStats, spectrumSpecificFeatureList) spectrumSpecificFeatureList += [precMass, GLFD.getChargeStateFromDTAFName(scanFDict[scan]['dta']), peptLength] spectrumAndPSMSpecificFeatureDict[(scan, PSM[:2])] = spectrumSpecificFeatureList for j, scan in enumerate(heavyScans): i = j + len(lightScans) spectrumSpecificFeatureList = copy.copy(PSMSpecificFeatureList) # Add path score (and normalized variants), delta rank, delta score, number of negative PRMs, and minimum node score for spectrum to feature list pathScore = spectrumOrderedScoreStats[i]['Path Scores'][PSMIndexDict[PSM[:2]]] numNegativePRMs = spectrumOrderedScoreStats[i]['Num Negative PRMs'][PSMIndexDict[PSM[:2]]] spectrumSpecificFeatureList += [pathScore, pathScore/peptLength, pathScore/scoreStats[PSM[:2]]['Maximum Path Score'], -spectrumOrderedScoreStats[i]['PSM Rankings'][PSMIndexDict[PSM[:2]]], spectrumOrderedScoreStats[i]['Delta Scores'][PSMIndexDict[PSM[:2]]], numNegativePRMs, 
numNegativePRMs/float(peptLength-1), spectrumOrderedScoreStats[i]['Min Node Scores'][PSMIndexDict[PSM[:2]]]] # Add mass deviation from true peptide mass to feature list precMass = scanFDict[scan]['precMass'] spectrumSpecificFeatureList += [abs(truePMs[PSM[:2]] + pairConfig['NMod'] + pairConfig['CMod'] + Constants.mods['H2O'] + Constants.mods['H+'] - precMass)] peakAnnotationMassOffsetStats = Discriminator.getPeakAnnotationAndMassOffsetStats(DataFile.getMassIntPairs(scanFDict[scan]['dta']), specs[i], prmLadders[PSM[:2]], pairedPRMLadder, PNet) GLFD.addPeakAnnotationStatsToFeatureList(PNet, peakAnnotationMassOffsetStats, spectrumSpecificFeatureList, peptLength) GLFD.addMassOffsetStatsToFeatureList(peakAnnotationMassOffsetStats, spectrumSpecificFeatureList) spectrumSpecificFeatureList += [precMass, GLFD.getChargeStateFromDTAFName(scanFDict[scan]['dta']), peptLength] spectrumAndPSMSpecificFeatureDict[(scan, PSM[:2])] = spectrumSpecificFeatureList return spectrumAndPSMSpecificFeatureDict
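# Hedged sketch of how the (scan, PSM) -> feature-vector dict returned above could be
# serialized as LETOR-style training rows ('<label> qid:<id> 1:<v1> 2:<v2> ...'), the format
# the discriminator-training driver later in this collection says it produces. The label
# lookup and qid numbering here are placeholders, not the pipeline's actual scheme.
def writeLETORRows(spectrumAndPSMSpecificFeatureDict, labels, outFile):
    for qid, (key, features) in enumerate(sorted(spectrumAndPSMSpecificFeatureDict.items()), start=1):
        label = labels.get(key, 0)
        cells = ' '.join('%d:%f' % (i + 1, value) for i, value in enumerate(features))
        outFile.write('%d qid:%d %s\n' % (label, qid, cells))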
return np.unique(scanFs) if __name__ == '__main__': options = ArgLib.parse([ 'init', 'lads', 'sequest', 'mascot', 'pepnovo', 'output', 'database', 'symbolmap', 'pnovo', 'peaks', 'combined' ]) paramsDict = ArgLib.parseInitFile(options.init, options) progDict = ArgLib.getProgDict(DataFile.searchprogs, options) with open(options.symbolmap, 'r') as fin: symbolMap = pickle.load(fin) seqMap = DataFile.generateSeqMap(progDict, symbolMap, paramsDict) dbDict = DataFile.getDBInfo(options.database) processedInfo = {} if options.lads: LADSdict = eval(options.lads) for tdvfile in LADSdict.keys(): LADSScanInfo = DataFile.getScanInfo(tdvfile, dbDict['LADS']['fields'], delimiter='\t') processedInfo[LADSdict[tdvfile]] = DataFile.preprocessLADSScanInfo( LADSScanInfo, seqMap[LADSdict[tdvfile]], paramsDict['Pair Configurations'], dbDict['LADS']['fieldmap']) if options.pepnovo: pepNovoDict = eval(options.pepnovo)
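# The driver above parses its --lads/--pepnovo options with eval() on a dict literal such as
# "{'results.tdv': 'LADS Run 1'}". A minimal, assumed-equivalent sketch using
# ast.literal_eval, which accepts the same literals but rejects arbitrary expressions:
import ast

def parseDictOption(raw_option):
    return ast.literal_eval(raw_option) if raw_option else {}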
def addSingleModCandidates(scanF, scan_items, mod_lists, mod_ranges_list, mod_tuples_list, enumerated_mods, scanFDict, expand_mods, hashed_single_mods, prob_network, ep_step=0.01, mod_tolerance=0.1, ppmSTD=10, aa_spread=3): #print enumerated_mods #for i in range(len(scan_items)): # print scan_items[i] # print mod_lists[i] # print mod_tuples_list[i] # print '-----------------' candidates_map = {} mod_range_map = {} # print enumerated_mods for i, item in enumerate(scan_items): # print 'Scan', scan_items[i] for j, mod in enumerate(mod_lists[i]): if mod[0][0] == 'Isobaric Substitution': continue context = item[1] seq_without_ends = context[2:-2] mod_range = mod_ranges_list[i][j] if (context, mod_range, mod[0][0]) in candidates_map: continue if j > 0: start_cut = max( [mod_ranges_list[i][j - 1][1], mod_range[0] - aa_spread]) else: start_cut = max([0, mod_range[0] - aa_spread]) try: end_cut = min( [mod_range[1] + aa_spread, mod_ranges_list[i][j + 1][0]]) except IndexError: end_cut = min( [mod_range[1] + aa_spread, len(seq_without_ends)]) repl_mods = [] replace_seqs = [] mod_class = Validator.getModClass([mod_tuples_list[i][j]]) #has_defined_mod = any([ Validator.getModClass([mod_tuple]) == 1 for mod_tuple in enumerated_mods[(context, mod_range)] ]) # Don't expand if mod is an AASub and there already exists a defined mod for the same mod_range/context #if mod_class > 1 and has_defined_mod: # continue # print 'Mod', j, mod, start_cut, end_cut for repl_mod, error in hashed_single_mods[hashMass( mod[0][1], ep_step)]: # Don't enumerate mod if already enumerated or if mod error exceeds mod_tolerance repl_mod_error = (0 if not mod[0][2] else mod[0][2]) + error # print repl_mod, repl_mod_error, repl_mod[0], mod_tuples_list[i][j], mod_tuples_list[i][j][0], expand_mods[repl_mod], expand_mods[mod_tuples_list[i][j]] # print enumerated_mods[(context, mod_range)], context, mod_range if repl_mod[ 0] == 'undefined mass shift' or repl_mod in enumerated_mods[ (context, mod_range )] or abs(repl_mod_error) > mod_tolerance: # Don't add candidate if candidate already exists for scan item or mod_error of replacement exceeds tolerance or if it is an undefined mod continue elif mod_class == 1 and not ( repl_mod[0] == mod_tuples_list[i][j][0] and expand_mods[repl_mod] > expand_mods[mod_tuples_list[i][j]]): # If mod is a defined mod, only expand if the replacement mod has the same name but a different and more prevalent localization continue # print 'replace candidate', mod, repl_mod, error, repl_mod_error, context, seq_without_ends[start_cut:end_cut] repl_mods += [(repl_mod, repl_mod_error)] term = getTerminus(start_cut, end_cut, seq_without_ends) subseq = seq_without_ends[start_cut:end_cut] for repl_mod, error in repl_mods: # add candidates try: locs = getModLocs(subseq, term, repl_mod) for loc in locs[1]: replace_seqs += getSingleModSeq( subseq, repl_mod, loc, error) except IndexError: # Index error occurs when subseq is blank. This happens if an insertion is sandwhiched between two mods exactly or is sandwhiched between a mod and a terminus. 
Doesn't happen normally, but can happen in special cases, such as when the first amino acid in the context is an oxidized methionine and the Insertion is N-terminal to this # TODO: See if this case happens frequently enough with carbamidomethyls to mess up the results pass if len(replace_seqs) > 0: candidates_map[(context, mod_range, mod[0][0])] = replace_seqs mod_range_map[(context, mod_range)] = (start_cut, end_cut) # print candidates_map new_scan_items = [] if len(candidates_map) > 0 and scanF in scanFDict: precMass = scanFDict[scanF]['precMass'] epSTD = ppmSTD * precMass * 10**-6 spec = PN.Spectrum(prob_network, precMass, Nmod=0.0, Cmod=0.0, epsilon=2 * epSTD, spectrum=DataFile.getMassIntPairs( scanFDict[scanF]['dta']), useMemo=True) spec.initializeNoiseModel() # Generate new peptide candidates for i, item in enumerate(scan_items): for j, mod_range in enumerate(mod_ranges_list[i]): # add candidate if (item[1], mod_range, mod_lists[i][j][0][0]) in candidates_map: # print '----------------' # print item[1] # print mod_range # print mod_lists[i][j] # print mod_ranges_list # for candidate in candidates_map[(item[1], mod_range, mod_lists[i][j][0][0])]: # print candidate new_scan_items += [ getSingleModCandidate( spec, scanFDict[scanF]['charge'], item, mod_lists[i], mod_ranges_list[i], j, mod_range_map[(item[1], mod_range)], candidate[0], candidate[1], candidate[2], candidate[3]) for candidate in candidates_map[( item[1], mod_range, mod_lists[i][j][0][0])] ] #print 'Num Single Mods Before getUniqueCandidates', len(new_scan_items) return getUniqueCandidates(new_scan_items)
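# hashMass and hashed_single_mods are defined outside this excerpt; below is a minimal
# assumed sketch of the bucketing idea they rely on: a mass is mapped to an integer bin of
# width ep_step, and candidate replacement mods are pre-inserted into every bin within
# mod_tolerance, so the lookup above is a single dict access rather than a scan of all mods.
def hashMass(mass, ep_step=0.01):
    return int(round(mass / ep_step))

def hashSingleMods(single_mods, ep_step=0.01, mod_tolerance=0.1):
    """single_mods: iterable of (mod_tuple, mod_mass). Returns bin -> [(mod_tuple, error)]."""
    hashed = {}
    half_width = int(round(mod_tolerance / ep_step))
    for mod_tuple, mod_mass in single_mods:
        center = hashMass(mod_mass, ep_step)
        for offset in range(-half_width, half_width + 1):
            # error approximates (query mass in this bin) - (replacement mod mass)
            hashed.setdefault(center + offset, []).append((mod_tuple, offset * ep_step))
    return hashed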
dirPath = 'C:\\Users\\Arun\\Pythonprojects\\DeNovoSequencing\\LF2_short_HCD+CID_ath001862_244\\'
dtaNames = DataFile.getDTAFNamesInDir(dirPath)
scansIter = iter(dtaNames)
currScanInfo = scansIter.next()
for dta in dtaNames:
    precMass = DataFile.getPrecMassAndCharge(dta)[0]
    spectra = DataFile.getMassIntPairs(dta)
    S = Spectrum(PN, precMass, 0.0, 0.0, spectra)
    corr = S.correctParentMass()
    if np.abs(corr) > 0.04:
        print dta, corr
"""
paramsDict = DataFile.parseParams('/home/arun/Documents/LADS_SILAC_Trypsin.ini')
print getPRMLadder('A', 0)
"""
heavyPath = "C:\\Users\\Arun\\DropBox\\SpectraCorrelation\\244.3367.3367.1.dta"
lightPath = "C:\\Users\\Arun\\DropBox\\SpectraCorrelation\\244.3383.3383.1.dta"
heavyPairs = DataFile.getMassIntPairs(heavyPath)
lightPairs = DataFile.getMassIntPairs(lightPath)
heavyPrecMass, heavyCharge = DataFile.getPrecMassAndCharge(heavyPath)
lightPrecMass, lightCharge = DataFile.getPrecMassAndCharge(lightPath)
heavySpec = Spectrum(PN, heavyPrecMass, 0, mods['*'], heavyPairs)
lightSpec = Spectrum(PN, lightPrecMass, 0, 0, lightPairs)
heavySpec.initializeNoiseModel()
lightSpec.initializeNoiseModel()
print heavySpec.noiseModel
print lightSpec.noiseModel
import requests import time import os, sys from bs4 import BeautifulSoup import DataFile import time import Mail import Template from ast import literal_eval from itertools import chain url_prefix = "http://tj01.tupu.hb.ted:28026" get_word_loc = "http://10.143.54.80:81/vr_query_period/vr_query_garbled_tupu.txt" word_file = "./word_tupurec" word_list = DataFile.read_file_into_list("./word_tupurec") result_file = './tupurec_garbled_result' report_tmp_path = "mail_detail.html" mail_to = "*****@*****.**" f_res = open(result_file, 'w', encoding='utf8') def log_info(str): time_str = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) sys.stdout.write('[%s] [info] %s\n' % (time_str, str)) sys.stdout.flush() def utf8stdout(in_str): utf8stdout = open(1, 'w', encoding='utf-8',
def getSharedPeaksRatio(lightPath, heavyPath, epsilon):
    lightPairs = DataFile.getMassIntPairs(lightPath)
    heavyPairs = DataFile.getMassIntPairs(heavyPath)
    N, C = SA.getNandCIons(lightPairs, heavyPairs, pairConfig['NMod'], pairConfig['CMod'], epsilon=epsilon)
    return SA.getSharedPeaksRatio(lightPairs, heavyPairs, N, C)
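# Minimal usage sketch for getSharedPeaksRatio above, assuming a pairConfig dict with
# 'NMod'/'CMod' is in scope (as the function body expects) and that epsilon is derived from a
# ppm tolerance on the precursor mass, as done elsewhere in this code; the paths are placeholders.
lightPath = 'scans/0244.0244.2.dta'
heavyPath = 'scans/0250.0250.2.dta'
ppm = 5.0
epsilon = ppm * 10**-6 * DataFile.getPrecMassAndCharge(lightPath)[0]
ratio = getSharedPeaksRatio(lightPath, heavyPath, epsilon)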
p.join() for l in L: for j in l: outFile.write(str(j) + '\t') outFile.write('\n') if __name__ == '__main__' : options = ArgLib.parse(['init', 'dtadir', 'config', 'model', 'output', 'columns', 'verbose', 'paircutoff', 'ppmsyserror', 'ppmstd', 'ppmpenalty', 'ambigpenalty', 'minedge', 'maxedge', 'alpha', 'subgraphcut', 'symbolmap']) epStep = 0.00025 maxEp = 0.1 paramsDict = ArgLib.parseInitFile(options.init, options) with open(options.symbolmap, 'r') as fin: symbolMap = pickle.load(fin) seqMap = DataFile.generateSeqMap({'LADS Unit Test': 'LADS'}, symbolMap, paramsDict) if options.columns: with open(options.columns) as fin: cols = pickle.load(fin) else: print 'Using default cols' cols = ['light scan', 'heavy scan', 'pair configuration', 'M+H', 'score', 'seq', 'epsilon', 'ambiguous edges', 'num ambig edges'] if options.output: outFile = open(options.output, 'w') outFile.write('\t'.join([col.upper() for col in cols]) + '\n') PNet = PN.ProbNetwork(options.config, options.model) dtaList = glob.glob(options.dtadir + '/*.dta')
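# scanFDict, as consumed by getSpectrumAndPSMFeatureDict earlier in this collection, is not
# built in this excerpt; a hedged sketch of an assumed construction from dtaList, caching the
# .dta path plus precursor mass and charge per scan number. Any additional keys the real
# pipeline stores are not shown here.
scanFDict = {}
for dta in dtaList:
    precMass, charge = DataFile.getPrecMassAndCharge(dta)
    scanFDict[DataFile.getScanNum(dta)] = {'dta': dta, 'precMass': precMass, 'charge': charge}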
for i, feature in enumerate(featureNames): print '%i. %s: %f' % (i+1, feature, featureList[i]) if __name__ == '__main__': print 'This program generates LETOR format training data for the training of a discriminator. dtadir is of the formate {/loc of dtadir: (loc of LADS SequenceDTAsTDV.py LOG file, loc of combined SEQUEST-MASCOT database results' options = ArgLib.parse(['init', 'dtadir', 'ppmstd', 'symbolmap', 'output', 'model', 'config']) paramsDict = ArgLib.parseInitFile(options.init, options) pairConfigurations = paramsDict['Pair Configurations'] ppm = float(options.ppmstd) dtadirInfo = eval(options.dtadir) with open(options.symbolmap, 'r') as fin: symbolMap = pickle.load(fin) seqMap = DataFile.generateSeqMap({'LADS Unit Test': 'LADS'}, symbolMap, paramsDict) seqMap = seqMap['LADS Unit Test'] PNet = PN.ProbNetwork(options.config, options.model) outFile = open(options.output, 'w') featureNames = generateFeatureNames(PNet) #printFeatureNames(featureNames) heavySeqMaps = {} for confName in pairConfigurations: heavySeqMaps[confName] = copy.deepcopy(seqMap) heavySeqMaps[confName]['Mods']['N-Term'] = pairConfigurations[confName]['NModSymbol'] heavySeqMaps[confName]['Mods']['C-Term'] = pairConfigurations[confName]['CModSymbol']
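# printFeatureNames is referenced above but commented out; a minimal assumed version that
# numbers each generated feature name in the same "1. <name>" style used when feature values
# are printed at the top of this snippet.
def printFeatureNames(featureNames):
    for i, name in enumerate(featureNames):
        print('%i. %s' % (i + 1, name))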
if __name__ == '__main__': print 'dtadir is the directory containing the mzXML files to analyze' print 'peaks is a dictionary mapping {experiment_name: peaks csv}' print 'output is the directory to move all files to and set up the project in' options = ArgLib.parse(['init', 'dtadir', 'peaks', 'output']) print 'options.output: %s' % (options.output) print 'normpath(options.output): %s' % (os.path.normpath(options.output)) # Fails with an OSError if directory already exists os.makedirs(options.output) # Create database args = ['--sqlite', os.path.join(options.output, 'results.db')] print 'Models.py dir: %s' % (DATABASE_SCRIPT_DIR) DataFile.executeProcess(DATABASE_SCRIPT_DIR, 'Models.py', args) # Make experiment directories # Structure # /options.output # .../ExperimentName # ...../data # ...../de_novo # ...../taggraph for experiment, peaks_file in eval(options.peaks).items(): experiment_dir = os.path.join(options.output, experiment) os.makedirs(experiment_dir) # Make the de_novo subdirectory peaks_dir = os.path.join(experiment_dir, 'de_novo') os.makedirs(peaks_dir)
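# The excerpt above creates the experiment directory and its de_novo subdirectory; per the
# layout sketched in its comments, data/ and taggraph/ subdirectories sit alongside it. An
# assumed compact equivalent of that setup step:
for subdir in ('data', 'de_novo', 'taggraph'):
    subdir_path = os.path.join(experiment_dir, subdir)
    if not os.path.exists(subdir_path):
        os.makedirs(subdir_path)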
# print key, MASCOTInfo[key] """ seq = '-K*M#--' ambigEdges=[(0,1000),(0,2000),(0,4000)] paramsDict = DataFile.parseParams('./Misc/LADS_SILAC_Trypsin.ini') seqMap = DataFile.generateSeqMap(['SEQUEST', 'MASCOT', 'LADS'], paramsDict) Constants.NTermMods['['] = 10000 Constants.CTermMods[']'] = 20000 Constants.NTermMods['^'] = 40000 seqMap['LADS']['Mods']['N-Term'] = '[' seqMap['LADS']['Mods']['$'] = '^' seqMap['LADS']['Mods']['C-Term'] = ']' #nodeGen = Constants.nodeInfoGen(seq, considerTerminalMods=True, addTerminalNodes=True, ambigEdges=ambigEdges) #for node in nodeGen: #print node newSeq = preprocessSequence(seq, seqMap['LADS'], replaceExistingTerminalMods=True, ambigEdges=copy.copy(ambigEdges)) print newSeq print getPRMLadder(newSeq, ambigAA='X', addEnds=True, ambigEdges=copy.copy(ambigEdges)) """ """ #print comparePeptideResults('AAKKIKK', 'KAAIKKK') dirPath = '/home/arun/Proteomics_Data/LF2_short_HCD+CID_ath001862_244' heavyPath = dirPath + '244.3383.3383.1.dta' lightPath = dirPath + '3760.0160.0160.3.dta' """ dbInfo = DataFile.getDBInfo('./Misc/searchformatparams.pag') compInfo = DataFile.getScanInfo('/home/arun/Downloads/ath009552_ppm5_alpha0.90_min300_max500_PC0.05_SGCut300_SEQUEST_CompareSearches.py_UnitTest.tdv', delimiter='\t') writeFASTAFile(compInfo, 'LADS Unit Test', dbInfo['infoMap'], 'test.txt', ambigEdgeCutoff=1)
if __name__ == '__main__': ''' Maxkeys = int(input('Enter max number of keys : ')) t = HardCodeTree(Maxkeys) t.PrintIndexFile() ans = t.SearchKeyRec(88) if ans: print(ans[0],ans[1]) else: print('NONE') ''' NumberOfRecords = int(input('Enter number of records : ')) Maxkeys = int(input('Enter max number of keys : ')) DataFile.createDataFile(NumberOfRecords) t = CreateBPlusTree('dataFile.bin', Maxkeys) t.PrintIndexFile() while True: try: key = int(input('\nEnter key to be searched : ')) n = int(input('\nEnter number of keys to be fetched : ')) except: print('The value of key must be a positive number !!') keytup = t.SearchKeyRec(key) if keytup: keytup = keytup[1] print('FOUND !!! The given key exists as -> ', keytup)
default='', help= 'Prefix for FMIndex output files. Base of Fasta input file will be used if not supplied.' ) parser.add_argument('fasta', help='Input FASTA filename. Must end with .fasta') args = parser.parse_args() if not os.path.basename(args.fasta.lower()).endswith('.fasta'): raise FileNotFoundError( "Error! FASTA input {} doesn't end with .fasta!".format(args.fasta)) if args.output == '': output_filename = os.path.basename(args.fasta[:-6]) print( 'BLANK OUTPUT basename - using the FASTA input file base: {}'.format( output_filename)) args.output = output_filename Database.makeDBForFMIndexFromFASTA(args.fasta, args.output) fmbuild_loc = os.path.abspath( os.path.join(os.path.join(PAR_DIR, 'lib'), 'fmindex')) for fm_formatted in glob.glob(args.output + '*fmFormatted*'): DataFile.executeProcess(fmbuild_loc, 'fmbuild', [ '-v', fm_formatted, args.output + '.fm%s' % os.path.splitext(fm_formatted)[1] ], interpreter=False) os.remove(fm_formatted)
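# A small assumed post-build check (not part of the build script above): confirm that the
# three companion files the TagGraph driver elsewhere in this collection looks for next to an
# index basename are present after fmbuild finishes.
import os

def fmIndexComplete(index_basename):
    return all(os.path.isfile(index_basename + suffix)
               for suffix in ('.fm.1', '.offsets', '.seqnames.1'))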
paramsDict = ArgLib.parseInitFile(options.init, options) infoDict = eval(options.mainprogname) with open(options.unimoddict) as fin: unimodDict = pickle.load(fin) hashedUnimodDict = hashUnimodDict(unimodDict) outFile = open(options.output, 'w') cols = ['ScanF', 'Score', 'Peptide', 'Unmod Peptide', 'References', 'Modifications', 'DB Peptide', 'Alignment Score'] if 'Ambig Edges' in infoDict: cols.insert(2, 'Ambig Edges') outFile.write('\t'.join([col for col in cols]) + '\n') for entry in DataFile.getScanInfo(options.comp, delimiter='\t'): scanData = {} scanData['ScanF'] = entry[infoDict['ScanF']] scanData['Peptide'] = entry[infoDict['Peptide']] scanData['Unmod Peptide'] = An.stripModifications(scanData['Peptide'], noRemove=[]) scanData['Score'] = entry[infoDict['Score']] scanData['Alignment Score'] = None if 'Ambig Edges' in infoDict: ambigEdges = eval(entry[infoDict['Ambig Edges']]) scanData['Ambig Edges'] = ambigEdges else: ambigEdges = [] deNovoPRMLadder = An.getPRMLadder(scanData['Peptide'], ambigEdges=ambigEdges) refList = eval(entry[infoDict['References']])
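# The loop above fills scanData per entry; an assumed later step writes each dict out in the
# same column order as the header, tab-delimited, with unset values left blank. This mirrors
# the header line written above but is a sketch, not the script's actual output code.
outFile.write('\t'.join(str(scanData.get(col, '')) for col in cols) + '\n')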
def getSharedPeaksRatio(lightPath, heavyPath, epsilon):
    lightPairs = DataFile.getMassIntPairs(lightPath)
    heavyPairs = DataFile.getMassIntPairs(heavyPath)
    N, C = SA.getNandCIons(lightPairs, heavyPairs, 17.0265, -16.0187, epsilon=epsilon)
    return SA.getSharedPeaksRatio(lightPairs, heavyPairs, N, C)
# Load positions of sequence seperators protein_names = [] for seqnames_file in sorted(glob.glob(index_basename + '.seqnames*')): protein_names += [anydbm.open(seqnames_file)] with open(index_basename + '.offsets') as fin: protein_offsets = pickle.load(fin) # Load FM Index seq_indices = [] for seqnames_file in sorted(glob.glob(index_basename + '.fm*')): seq_indices += [fm.FM_load(seqnames_file)] # Load de novo sequence info de_novo_cols, de_novo_results = DataFile.getScanInfo(options.denovo, delimiter='\t') # Initialize ambiguous edges to empty list if data is not present in de novo results (i.e., not running TAG-GRAPH on LADS) ambig_edges_present = 'Ambig Edges' in de_novo_cols ambig_edges = [] # Prep output file outFile = open(options.output, 'w') cols = [ 'ScanF', 'Alignment Score', 'Rank', 'Context', 'Modifications', 'Proteins', 'Matching Tag Length', 'De Novo Peptide', 'Unmod De Novo Peptide', 'De Novo Score', 'Time Taken' ] if ambig_edges_present: cols.insert(-3, 'Ambig Edges') outFile.write('\t'.join(cols) + '\n')