Exemple #1
0
def file_parser():
  """Extract a subset of columns from Bob's protein groups table.

  Reads "bob/24h_proteingroups.csv" and writes "bob/processed/24h_bobdata.csv".
  Only the first line yielded by the input file is processed; it is split on
  carriage returns into rows and each row is split on tabs. Rows whose
  column 100 or column 101 equals "+" are dropped (contaminants and reverse
  hits, according to the original author). Every kept row is written as

      running index, col 1, col 6, col 3, col 86 .. col 91

  joined with commas. Per the original author's note these are the majority
  protein IDs, the razor+unique peptide counts and the six LFQ intensity
  columns (KO1-3, WT1-3); column 6 is presumably the gene name -- TODO confirm
  against the input header.

  Prints the number of rows found and, at the end, the number of rows written.

  Raises:
    IndexError: if a non-empty row has fewer than 102 tab separated fields.
  """
  from root.ed.tools import file_importer, file_outporter
  print("this is file parser")
  inpF = file_importer("bob/24h_proteingroups.csv")
  outF = file_outporter("bob/processed/24h_bobdata.csv")
  keptCols = (1, 6, 3, 86, 87, 88, 89, 90, 91)
  cN = 0  # bound up front so the final print also works for an empty input
  try:
    for inpLine in inpF:
      inpP = inpLine.split("\r")
      cN = 0
      print(len(inpP))
      for inpI in inpP:
        # a trailing carriage return (or a CRLF ending) leaves an empty
        # fragment behind; it holds no data and must not be indexed
        if not inpI.strip("\r\n"):
          continue
        inpItems = inpI.split("\t")
        if inpItems[100] == "+" or inpItems[101] == "+":
          continue  # get rid of contaminants and reverse proteins
        outF.write(",".join([str(cN)] + [inpItems[colI] for colI in keptCols]) + "\n")
        cN += 1
      break  # the whole table is expected to sit in the first line
  finally:
    # release both handles even when a malformed row aborts the run
    inpF.close()
    outF.close()
  print(cN)
Exemple #2
0
def entry_parser():
  """Collapse multi-entry ID and peptide-count cells to a single entry.

  Reads "bob/processed/24h_bobdata.csv" and writes
  "bob/processed/24h_bobdata_ed.csv". For every comma separated input row:

  * column 1 holds ";" separated IDs; the shortest one is kept (the first
    one wins on ties),
  * rows whose chosen ID contains "__" are dropped,
  * column 3 holds ";" separated values; the one at the same position as the
    chosen ID is kept,
  * a "-" sitting two or three characters from the end of the chosen ID cuts
    that suffix off (e.g. "-2" or "-10").

  The remaining columns (0, 2, 4..9) are copied unchanged. Finally the number
  of lines read and the number of rows written are printed.

  Raises:
    IndexError: if a non-blank row has fewer than 10 columns, or column 3 has
      fewer entries than the position of the chosen ID.
  """
  from root.ed.tools import file_importer, file_outporter

  print("this is entry parser")

  relPath = "bob/processed/24h_bobdata.csv"
  outPath = "bob/processed/24h_bobdata_ed.csv"
  inpF = file_importer(relPath)
  outF = file_outporter(outPath)
  cN = 0
  hitN = 0
  try:
    for inpLine in inpF:
      cN += 1
      inpLine = inpLine.strip()
      if not inpLine:
        continue  # blank lines carry no columns to index
      inpItem = inpLine.split(",")
      geneL = inpItem[1].split(";")
      curGene = min(geneL, key=len)  # first shortest entry, as min() keeps the earliest
      if "__" in curGene:
        continue
      protL = inpItem[3].split(";")
      curProt = protL[geneL.index(curGene)]
      # the length guard keeps very short IDs from raising IndexError
      for suffixLen in (2, 3):
        if len(curGene) >= suffixLen and curGene[-suffixLen] == "-":
          curGene = curGene[:-suffixLen]
      outItems = [inpItem[0], curGene, inpItem[2], curProt]
      outItems.extend(inpItem[colI] for colI in range(4, 10))
      outF.write(",".join(outItems) + "\n")
      hitN += 1
    print("%s %s" % (cN, hitN))
  finally:
    # release both handles even when a malformed row aborts the run
    inpF.close()
    outF.close()
Exemple #3
0
def stat_parser():
  """Write the significantly changed proteins to two result files.

  Reads "bob/processed/24h_bobdata_ed2.csv", whose fields are separated by
  the three character sequence quote-space-quote. The first input line is
  skipped and a fixed header is written to both outputs instead. A row is
  kept when its last field parses as a float below 0.05 and column 3 is an
  integer above 1 (at least two peptides). A last field that does not parse
  as a float is treated as 1, i.e. not significant.

  For kept rows columns 4..6 (KO) and 7..9 (WT) are converted to int, the
  log2 of mean(KO) / mean(WT) is computed and inserted just before the last
  field. Rows with mean(KO) > mean(WT) go to
  "bob/processed/24h_bobprots_down_full.csv", all others to
  "bob/processed/24h_bobprots_up_full.csv".

  Raises:
    ZeroDivisionError: if the WT intensities of a kept row sum to zero.
    ValueError: if the KO intensities of a kept row sum to zero (log of 0),
      or if column 3 or an intensity is not an integer.
  """
  from root.ed.tools import file_importer, file_outporter
  from math import log

  print("this is stat parser")

  relPath = "bob/processed/24h_bobdata_ed2.csv"
  outPathUp = "bob/processed/24h_bobprots_up_full.csv"
  outPathDown = "bob/processed/24h_bobprots_down_full.csv"
  headerLine = "ID,Uniprot ID,Gene name,unique peptides (unique+razor),KO1,KO2,KO3,WT1,WT2,WT3,enrichment,P value\n"
  inpF = file_importer(relPath)
  outFUp = file_outporter(outPathUp)
  outFDown = file_outporter(outPathDown)

  skipFlag = True
  try:
    for inpLine in inpF:
      if skipFlag:
        # replace the input header by our own one in both result files
        skipFlag = False
        outFDown.write(headerLine)
        outFUp.write(headerLine)
        continue
      curLine = [inpI.strip("\"\n") for inpI in inpLine.split("\" \"")]
      try:
        curLine[-1] = float(curLine[-1])
      except ValueError:
        curLine[-1] = 1  # unparsable p value counts as not significant
      # check if protein has at least 2 unique peptides and has a significant p value
      if not (curLine[-1] < 0.05 and int(curLine[3]) > 1):
        continue
      curLine[4:10] = [int(x) for x in curLine[4:10]]
      koMean = sum(curLine[4:7]) / 3.0
      wtMean = sum(curLine[7:10]) / 3.0
      enrScore = log(koMean / wtMean, 2)  # calculate log2 enrichment score
      # Build the row by position. Comparing fields with "is" breaks as soon
      # as two fields are the same cached small int: the enrichment score
      # then gets written more than once.
      outFields = curLine[:-1] + [enrScore, curLine[-1]]
      outRow = ",".join(str(outI).strip(" ") for outI in outFields) + "\n"
      # NOTE(review): higher KO mean is routed to the "down" file -- confirm
      # this naming is intended.
      if koMean > wtMean:
        outFDown.write(outRow)
      else:
        outFUp.write(outRow)
  finally:
    # release all handles even when a malformed row aborts the run
    inpF.close()
    outFUp.close()
    outFDown.close()
  print("stat_parser completed")
Exemple #4
0
def lfq_parser():
  """Replace zero LFQ values by a random integer between 1 and 100.

  Reads "bob/processed/24h_bobdata_ed.csv" and writes
  "bob/processed/24h_bobdata_no0_ed.csv". The first line is copied verbatim.
  For every other line the first four comma separated columns are copied and
  each remaining column that parses to the integer 0 is replaced by
  randint(1, 100); other values are copied as they are.

  According to the original author this is needed for t-testing later in R,
  as each measurement there has to have some sort of noise in it.

  Lines whose column 4 is missing or not an integer are reported on stdout
  and skipped.

  Raises:
    ValueError: if a column after column 4 is not an integer.
  """
  from root.ed.tools import file_importer, file_outporter
  from random import randint

  print("this is lfq parser")

  relPath = "bob/processed/24h_bobdata_ed.csv"
  outPath = "bob/processed/24h_bobdata_no0_ed.csv"
  inpF = file_importer(relPath)
  outF = file_outporter(outPath)
  headerFlag = True
  try:
    for inpLine in inpF:
      if headerFlag:
        headerFlag = False
        outF.write(inpLine)
        continue
      inpItems = inpLine.strip().split(",")
      try:
        int(inpItems[4])  # get rid of wonky erroneous lines introduced by excel
      except (ValueError, IndexError):
        # IndexError covers blank or truncated lines, which are just as wonky
        print("%s %s" % ("bad line found here, ignored: ", inpItems))
        continue
      # copy over lfq values while replacing 0-s with random values
      lfqVals = [str(randint(1, 100)) if int(inpJ) == 0 else inpJ for inpJ in inpItems[4:]]
      # join() separates correctly for any number of value columns
      outF.write(",".join(inpItems[0:4] + lfqVals) + "\n")
  finally:
    # release both handles even when a malformed row aborts the run
    inpF.close()
    outF.close()
Exemple #5
0
def main():
  """Check peptide fractions in evidence.txt against protein isoelectric points.

  Reads "bob/evidence.txt" (tab separated). Column 14 holds ";" separated
  protein IDs and column 20 the fraction number. For every ID of every row
  the value iso_e() returns for the protein sequence (looked up in the
  mapping returned by uniprot_dicter()) is compared with the fraction: it
  counts as matching when it lies strictly within 2 of the fraction number.

  A "-" in the second to last position of an ID cuts that suffix off. IDs
  starting with "REV" or "CON" are counted as genes but not checked. Rows
  whose fraction is not an integer have that field printed and are skipped;
  IDs without a known sequence are reported on stdout.

  Prints the number of distinct IDs seen and the matching / unmatching
  counts. Returns nothing.
  """
  from root.ed.tools import file_importer, uniprot_dicter, iso_e
  from collections import defaultdict
  print("this is evidence parser")
  inpEvF = file_importer("bob/evidence.txt")
  try:
    seqD = uniprot_dicter()
    # gene -> [sequence]; unknown genes yield an empty list on lookup
    resD = defaultdict(list)
    for keyS, valueS in seqD.items():
      resD[keyS].append(valueS.strip())
    geneS = set()  # distinct IDs seen; a set keeps membership tests O(1)
    goodC = 0
    badC = 0
    for inpLine in inpEvF:
      inpList = inpLine.split("\t")
      if len(inpList) < 21:
        continue  # blank or truncated row: columns 14 and 20 are not there
      inpL = inpList[14].split(";")  # leading protein column, uniprot IDs separated by ;
      try:
        inpFrac = int(inpList[20])  # the "fraction" column, isoelectric fractionation number
      except ValueError:
        print(inpList[20])
        continue
      for inpI in inpL:
        # the length guard keeps empty or one character IDs from raising
        if len(inpI) >= 2 and inpI[-2] == "-":
          inpI = inpI[:-2]
        geneS.add(inpI)
        if inpI.startswith(("REV", "CON")):
          continue  # there are some weird entries. CON is contaminants methinks.
        try:
          # does the isoelectric point of the peptide's protein match the
          # fraction it was retrieved from?
          if inpFrac - 2 < iso_e(resD[inpI][0]) < inpFrac + 2:
            goodC += 1
          else:
            badC += 1
        except IndexError:
          print("%s has no sequence" % inpI)
    print(str(len(geneS)) + " genes found in evidence.txt")
    print(str(goodC) + " matching, and " + str(badC) + " unmatching peptides found")
  finally:
    # release the handle even when a malformed row aborts the run
    inpEvF.close()
Exemple #6
0
def intact_parser():
  """Collect the protein accessions listed in ptpn22.txt and convert them.

  Reads "ptpn22_ppi_data/ptpn22.txt" (tab separated), skipping its first
  line. From every row the text after the last ":" of column 1 and then of
  column 0 is taken. Each distinct value is kept once, in order of first
  appearance. The resulting list is handed to prot_id_converter() with
  outDB="refseqproteingi" (uniprot to refseq conversion, per the original
  author).

  Returns:
    Whatever prot_id_converter() returns for the collected accessions.

  Raises:
    IndexError: if a non-blank row has fewer than two columns.
  """
  from root.ed.tools import prot_id_converter, file_importer

  relPath = "ptpn22_ppi_data/ptpn22.txt"
  inpF = file_importer(relPath, "r")
  preyL = []      # accessions in order of first appearance
  seenS = set()   # same accessions, for O(1) duplicate checks
  try:
    headerFlag = True
    for inpLine in inpF:
      if headerFlag:
        headerFlag = False
        continue
      if not inpLine.strip():
        continue  # blank lines carry no columns to index
      inpList = inpLine.split("\t")
      for colN in (1, 0):
        inpItem = inpList[colN].split(":")[-1]
        if inpItem not in seenS:
          seenS.add(inpItem)
          preyL.append(inpItem)
  finally:
    # release the handle even when a malformed row aborts the run
    inpF.close()
  idList = prot_id_converter(preyL, "", outDB="refseqproteingi")  # convert uniprot ID to refseq accessions
  return idList