def file_parser():
    """From bob's proteinGroups.txt take:
    Majority protein IDs
    Peptide counts (razor+unique)
    ['LFQ intensity KO1', 'LFQ intensity KO2', 'LFQ intensity KO3',
     'LFQ intensity WT1', 'LFQ intensity WT2', 'LFQ intensity WT3']
    and write them to a new file. Do not select contaminants or reverse peptides."""
    from root.ed.tools import file_importer, file_outporter
    print "this is file parser"
    inpF = file_importer("bob/24h_proteingroups.csv")
    outF = file_outporter("bob/processed/24h_bobdata.csv")
    for inpLine in inpF:  # the export apparently uses "\r" line endings, so the whole table arrives as one physical line
        inpP = inpLine.split("\r")
        cN = 0
        print len(inpP)
        for inpI in inpP:
            inpItems = inpI.split("\t")
            if inpItems[100] == "+" or inpItems[101] == "+": continue  # get rid of contaminants and reverse proteins
            outF.write(str(cN) + "," + inpItems[1] + "," + inpItems[6] + "," + inpItems[3] + "," + inpItems[86] + "," + inpItems[87] + "," + inpItems[88] + "," + inpItems[89] + "," + inpItems[90] + "," + inpItems[91] + "\n")
            # print inpItems[1], "+++", inpItems[3], "+++", inpItems[6], "+++", inpItems[86:92]
            cN += 1
            # if cN == 40: break
        break  # only the first "\r"-delimited block is processed
    inpF.close()
    outF.close()
    print cN
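
# file_importer and file_outporter are imported from root.ed.tools, which is not part of
# this file. The two helpers below are only a minimal sketch of what they are assumed to
# do, inferred from how they are called in this module (file_importer(path[, mode]) yields
# a readable file object, file_outporter(path) a writable one); they are hypothetical
# stand-ins, not the project's real implementation.
def _file_importer_sketch(relPath, mode="r"):
    # open a file for reading, relative to the current working directory
    return open(relPath, mode)

def _file_outporter_sketch(relPath):
    # create/truncate a file and open it for writing
    return open(relPath, "w")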
def entry_parser():
    """Remove duplicate protein name and total peptide count cell entries from bob's dataset."""
    from root.ed.tools import file_importer, file_outporter
    print "this is entry parser"
    relPath = "bob/processed/24h_bobdata.csv"
    outPath = "bob/processed/24h_bobdata_ed.csv"
    inpF = file_importer(relPath)
    outF = file_outporter(outPath)
    cN = 0
    hitN = 0
    for inpLine in inpF:
        cN += 1
        inpLine = inpLine.strip()
        inpItem = inpLine.split(",")
        geneL = inpItem[1].split(";")
        lenS = len(geneL[0])
        curGene = geneL[0]
        for geneI in geneL:  # take the shortest name from the semicolon-separated protein ID list
            if len(geneI) < lenS:
                lenS = len(geneI)
                curGene = geneI
        if "__" in curGene: continue  # skip IDs containing a double underscore (contaminant/reverse style entries)
        protL = inpItem[3].split(";")
        curProt = protL[geneL.index(curGene)]  # take the peptide count that belongs to the chosen ID
        if curGene[-2] == "-": curGene = curGene[:-2]  # strip single-digit isoform suffixes such as "-1"
        if curGene[-3] == "-": curGene = curGene[:-3]  # strip double-digit isoform suffixes such as "-12"
        outF.write(inpItem[0] + "," + curGene + "," + inpItem[2] + "," + curProt + "," + inpItem[4] + "," + inpItem[5] + "," + inpItem[6] + "," + inpItem[7] + "," + inpItem[8] + "," + inpItem[9] + "\n")
        hitN += 1
    print cN, hitN
    inpF.close()
    outF.close()
def stat_parser():
    """Take protein names with a significant p value and write them out to a result file."""
    from root.ed.tools import file_importer, file_outporter
    from math import log
    print "this is stat parser"
    relPath = "bob/processed/24h_bobdata_ed2.csv"
    outPathUp = "bob/processed/24h_bobprots_up_full.csv"
    outPathDown = "bob/processed/24h_bobprots_down_full.csv"
    inpF = file_importer(relPath)
    outFUp = file_outporter(outPathUp)
    outFDown = file_outporter(outPathDown)
    skipFlag = True
    for inpLine in inpF:
        if skipFlag:  # replace the input header with a descriptive one in both output files
            skipFlag = False
            outFDown.write("ID,Uniprot ID,Gene name,unique peptides (unique+razor),KO1,KO2,KO3,WT1,WT2,WT3,enrichment,P value\n")
            outFUp.write("ID,Uniprot ID,Gene name,unique peptides (unique+razor),KO1,KO2,KO3,WT1,WT2,WT3,enrichment,P value\n")
            continue
        inpLine = inpLine.split("\" \"")  # the input fields are quote-wrapped and space-separated
        curLine = []
        for inpI in inpLine:
            curLine.append(inpI.strip("\"\n"))
        try:
            curLine[-1] = float(curLine[-1])
        except ValueError:  # a missing or malformed p value counts as not significant
            curLine[-1] = 1
        if curLine[-1] < 0.05 and int(curLine[3]) > 1:  # keep proteins with at least 2 unique peptides and a significant p value
            curLine[4:10] = [int(x) for x in curLine[4:10]]
            enrScore = log((sum(curLine[4:7]) / 3.0) / (sum(curLine[7:10]) / 3.0), 2)  # calculate log2 enrichment score (mean KO over mean WT)
            # print int(sum(curLine[4:7]) / 3.0), int(sum(curLine[7:10]) / 3.0)
            if sum(curLine[4:7]) / 3.0 > sum(curLine[7:10]) / 3.0:  # the mean of the KO intensities is higher than the WT mean
                for outN, outI in enumerate(curLine):  # index checks instead of "is" identity tests, which can misfire on equal interned ints
                    outFDown.write(str(outI).strip(" "))
                    if outN != len(curLine) - 1:
                        outFDown.write(",")
                        if outN == len(curLine) - 2:
                            outFDown.write(str(enrScore) + ",")  # insert the enrichment column just before the p value
                    else:
                        outFDown.write("\n")
                # outFDown.write(curLine[1] + "," + curLine[2] + "\n")
            else:
                # outFUp.write(curLine[1] + "," + curLine[2] + "\n")
                for outN, outI in enumerate(curLine):
                    outFUp.write(str(outI).strip(" "))
                    if outN != len(curLine) - 1:
                        outFUp.write(",")
                        if outN == len(curLine) - 2:
                            outFUp.write(str(enrScore) + ",")
                    else:
                        outFUp.write("\n")
    inpF.close()
    outFUp.close()
    outFDown.close()
    print "stat_parser completed"
def lfq_parser():
    """Remove 0 values from LFQ measurements and replace them with a random number between 1 and 100.
    This is needed for t-testing later in R, as each measurement there has to have some sort of noise in it."""
    from root.ed.tools import file_importer, file_outporter
    from random import randint
    print "this is lfq parser"
    relPath = "bob/processed/24h_bobdata_ed.csv"
    outPath = "bob/processed/24h_bobdata_no0_ed.csv"
    inpF = file_importer(relPath)
    outF = file_outporter(outPath)
    headerFlag = True
    for inpLine in inpF:
        if headerFlag:  # copy the header line unchanged
            headerFlag = False
            outF.write(inpLine)
            continue
        inpLine = inpLine.strip()
        inpItems = inpLine.split(",")
        try:
            int(inpItems[4])  # get rid of wonky erroneous lines introduced by Excel
        except ValueError:
            print "bad line found here, ignored: ", inpItems
            continue
        for inpI in inpItems[0:4]:  # copy over gene name and such to the new file
            outF.write(inpI)
            outF.write(",")
        commaCount = 0
        for inpJ in inpItems[4:]:  # copy over LFQ values while replacing 0-s with random values
            commaCount += 1
            if int(inpJ) == 0:
                randNum = randint(1, 100)
                outF.write(str(randNum))
            else:
                outF.write(inpJ)
            if commaCount < 6:
                outF.write(",")
        outF.write("\n")
    inpF.close()
    outF.close()
def main():
    """Count how many peptides in bob/evidence.txt were found in an isoelectric fraction
    consistent with the predicted isoelectric point of their leading protein."""
    from root.ed.tools import file_importer, uniprot_dicter, iso_e
    from collections import defaultdict
    print "this is evidence parser"
    inpEvF = file_importer("bob/evidence.txt")
    seqD = uniprot_dicter()
    resD = defaultdict(list)
    for keyS, valueS in seqD.items():  # create a dict keyed by all mouse genes, with the protein sequence as the first item of each value list
        resD[keyS].append(valueS.strip())
    geneL = []
    goodC = 0
    badC = 0
    for inpLine in inpEvF:
        inpList = inpLine.split("\t")
        inpL = inpList[14].split(";")  # the leading protein name column: uniprot IDs separated by ";"
        try:
            inpFrac = int(inpList[20])  # the "fraction" column: the isoelectric fractionation number
        except ValueError:
            print inpList[20]
            continue
        for inpI in inpL:
            if inpI[-2] == "-":
                inpI = inpI[:-2]  # strip isoform suffixes
            if inpI not in geneL:  # make a list of gene names
                geneL.append(inpI)
            if inpI[:3] == "REV" or inpI[:3] == "CON":
                continue  # there are some weird entries. CON is contaminants methinks.
            try:
                if inpFrac - 2 < iso_e(resD[inpI][0]) < inpFrac + 2:  # check if the isoelectric point of the parent protein matches the fraction the peptide was retrieved from
                    goodC += 1
                else:
                    badC += 1
                """resD[inpI].append()
                resD[inpI].append(iso_e(resD[inpI][0]))  # add isoelectric point to the genes found"""
            except IndexError:
                print "%s has no sequence" % inpI
    print str(len(geneL)) + " genes found in evidence.txt"
    print str(goodC) + " matching, and " + str(badC) + " unmatching peptides found"
    inpEvF.close()
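
# iso_e also comes from root.ed.tools and its implementation is not shown here. The sketch
# below is one common way to estimate a protein's isoelectric point (net charge from the
# Henderson-Hasselbalch equation, solved for zero by bisection, using the EMBOSS pKa set);
# it is an assumption about what iso_e might do, not a copy of it.
def _iso_e_sketch(seq):
    pKaPos = {"nterm": 8.6, "K": 10.8, "R": 12.5, "H": 6.5}  # basic groups
    pKaNeg = {"cterm": 3.6, "D": 3.9, "E": 4.1, "C": 8.5, "Y": 10.1}  # acidic groups

    def netCharge(pH):
        charge = 1.0 / (1.0 + 10 ** (pH - pKaPos["nterm"]))  # free N-terminus
        charge -= 1.0 / (1.0 + 10 ** (pKaNeg["cterm"] - pH))  # free C-terminus
        for aa, pK in pKaPos.items():
            if aa != "nterm":
                charge += seq.count(aa) / (1.0 + 10 ** (pH - pK))
        for aa, pK in pKaNeg.items():
            if aa != "cterm":
                charge -= seq.count(aa) / (1.0 + 10 ** (pK - pH))
        return charge

    lowPH, highPH = 0.0, 14.0
    while highPH - lowPH > 0.01:  # bisect until the zero-charge pH is bracketed tightly
        midPH = (lowPH + highPH) / 2.0
        if netCharge(midPH) > 0:
            lowPH = midPH
        else:
            highPH = midPH
    return (lowPH + highPH) / 2.0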
def intact_parser():
    """Open ptpn22.txt and extract prey protein uniprot accessions.
    Convert those to refseq protein accessions. Return them as a list."""
    from root.ed.tools import prot_id_converter, file_importer
    relPath = "ptpn22_ppi_data/ptpn22.txt"
    inpF = file_importer(relPath, "r")
    headerFlag = True
    preyL = []
    for inpLine in inpF:
        if headerFlag:  # skip the header line
            headerFlag = False
            continue
        inpList = inpLine.split("\t")
        inpItem = inpList[1].split(":")[-1]  # second column: strip the database prefix from the accession
        if inpItem not in preyL:
            preyL.append(inpItem)
        inpItem = inpList[0].split(":")[-1]  # first column: same treatment
        if inpItem not in preyL:
            preyL.append(inpItem)
    inpF.close()
    idList = prot_id_converter(preyL, "", outDB="refseqproteingi")  # convert uniprot IDs to refseq accessions
    return idList
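
# A minimal usage sketch, assuming all of the functions above live in this one module and
# that the file paths encode the intended order (file_parser -> entry_parser -> lfq_parser,
# then a t-test in R that produces 24h_bobdata_ed2.csv, then stat_parser). The R step is
# external and not shown; main() and intact_parser() are separate analyses and are not
# called here.
if __name__ == "__main__":
    file_parser()   # proteinGroups export -> bob/processed/24h_bobdata.csv
    entry_parser()  # 24h_bobdata.csv -> 24h_bobdata_ed.csv
    lfq_parser()    # 24h_bobdata_ed.csv -> 24h_bobdata_no0_ed.csv
    # ... t-test in R (assumed external step) produces 24h_bobdata_ed2.csv ...
    stat_parser()   # 24h_bobdata_ed2.csv -> up/down regulated protein lists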