Example #1
 def __init__(self, blastResults, k=10):
     out.writeDebug("Initalize Blast Alignment by blasting results ...")
     # self stuff
     self.hits = []
     # parse the blast results
     hitPattern = re.compile("<Hit>(.*?)</Hit>", re.DOTALL)
     hitIdPattern = re.compile("<Hit_def>(.*?)</Hit_def>")
     hitEValPattern = re.compile("<Hsp_evalue>(.*?)</Hsp_evalue>")
     hitFromPattern = re.compile("<Hsp_hit-from>(.*?)</Hsp_hit-from>")
     hitToPattern = re.compile("<Hsp_hit-to>(.*?)</Hsp_hit-to>")
     # for each hit in the xml
     i = 0
     for hit in hitPattern.finditer(blastResults):
         i += 1
         if i > k:
             break
         text = hit.group(0)
         hit_id = hitIdPattern.search(text).group(1)
         hit_e_value = hitEValPattern.search(text).group(1)
         hit_from = hitFromPattern.search(text).group(1)
         hit_to = hitToPattern.search(text).group(1)
         self.hits.append(
             {
                 "hit_id": hit_id,
                 "hit_value": float(hit_e_value),
                 "hit_from": int(hit_from),
                 "hit_to": int(hit_to),
                 "hit_order": False,
                 "method": "blast",
             }
         )
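
A minimal usage sketch, hedged: it assumes this __init__ belongs to the Blast class referenced in Examples #5 and #10, and that the input is the XML produced by blast2 -m 7; the hit values are made up.

xml = ("<Hit><Hit_def>P12345</Hit_def>"
       "<Hsp_evalue>1e-30</Hsp_evalue>"
       "<Hsp_hit-from>1</Hsp_hit-from>"
       "<Hsp_hit-to>35</Hsp_hit-to></Hit>")
alignment = Blast(xml, k=10)       # hypothetical: assumes the surrounding class is Blast
print alignment.hits[0]['hit_id']  # -> P12345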
Example #2
  def trainprediction(self, data=None, biased=False, maxEpochs=10000):
    """Trains the neural network with the provided training data; returns True if training ran, False if no data was given"""
    if not data:
      out.writeDebug('No training data! The net stays initialized with random weights!')
      return False

    # create a supervised data set from the training nodes
    ds = SupervisedDataSet(len(self.features), 2)
    reduced_dataset = [set(), set()]
    for node, target in data:
      featuresValue = []
      for feature in self.features:
        featuresValue.append(feature(self, node, None, node.querySequence))
        
      # deduplicate instances; the last two tuple entries encode the target class
      if target:
        reduced_dataset[0].add(tuple(featuresValue + [ACCEPTED, NOTACCEPTED]))
      else:
        reduced_dataset[1].add(tuple(featuresValue + [NOTACCEPTED, ACCEPTED]))

    # zip truncates to the smaller class, yielding a balanced data set
    for posInstance, negInstance in zip(reduced_dataset[0], reduced_dataset[1]):
      ds.addSample(posInstance[:-2],posInstance[-2:])
      ds.addSample(negInstance[:-2],negInstance[-2:])

    if biased:
      # biased: use all instances, keeping the natural class imbalance
      ds = SupervisedDataSet(len(self.features), 2)
      for instance in reduced_dataset[0]:
        ds.addSample(instance[:-2], instance[-2:])
      for instance in reduced_dataset[1]:
        ds.addSample(instance[:-2], instance[-2:])
    out.writeDebug('Start training neural net with %s training examples. Dataset bias is set to %s' % (len(ds), biased))
    trainer = BackpropTrainer(self.net, ds)
    trainer.trainUntilConvergence(maxEpochs = maxEpochs)
    
    return True      
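
For reference, the samples built above have this shape. A minimal PyBrain sketch with made-up feature values; it assumes ACCEPTED = 1 and NOTACCEPTED = -1, consistent with the (1, -1)/(-1, 1) target coding mentioned in Example #7.

from pybrain.datasets import SupervisedDataSet

ACCEPTED, NOTACCEPTED = 1, -1  # assumed constants
ds = SupervisedDataSet(3, 2)   # 3 input features, 2 target neurons
ds.addSample((0.1, 0.5, 0.9), (ACCEPTED, NOTACCEPTED))  # positive example
ds.addSample((0.7, 0.2, 0.4), (NOTACCEPTED, ACCEPTED))  # negative example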
Example #3
 def localHHBLITS(seq="NWLGVKRQPLWTLVLILWPVIIFIILAITRTKFPP", database="../data/PP2db", minEVal=1):
   import time
   out.writeDebug("Do a local hhblits search for {} in {}".format(seq, database))
   char_set = string.ascii_uppercase + string.digits
   # build a collision-safe temporary file suffix from the time and random characters
   time_stamp = str(int(time.time())) + ''.join(random.sample(char_set*6, 40)) + str(random.randint(0, 100000000))
   seq_file = ''
   seq = str(seq)
   # an all-uppercase string is treated as a raw sequence and written to a
   # temporary FASTA file; anything else is assumed to be a path to an input file
   if re.match('^[A-Z]*$', seq):
     seq_file = 'hhblits_input_' + time_stamp
     fh = open(seq_file, 'w')
     fh.write('>no_header\n' + seq)
     fh.close()
   else:
     seq_file = seq
   outfile = 'hhblits_'+time_stamp+'.out'
   command = "hhblits -i {} -o {} -d {} -e {} -n 1".format(seq_file, outfile, database, minEVal)
   try:
     hhblitsResults = subprocess.check_output(command, stderr=subprocess.STDOUT, shell=True)
     # the relevant output is written to the -o file, not to stdout
     hhblitsResults = open(outfile).read()
     os.remove(outfile)
   except subprocess.CalledProcessError as err:
     out.writeLog("hhblits search for {} in {} failed with exit code {}!".format(seq, database, err.returncode))
     hhblitsResults = ''
   # remove the temporary sequence file if we created one
   if seq != seq_file:
     os.remove(seq_file)
   return HHBLITS(hhblitsResults)
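
A hedged usage sketch: localHHBLITS is invoked through the HHBLITS class elsewhere in the code (see Example #10), and requires hhblits on the PATH plus an existing database; the arguments here simply mirror the defaults.

result = HHBLITS.localHHBLITS(seq="NWLGVKRQPLWTLVLILWPVIIFIILAITRTKFPP",
                              database="../data/PP2db", minEVal=1)
for hit in result.hits:
    print hit['hit_id'], hit['hit_value']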
Example #4
 def __init__(self, hhblitsResults, k = 10):
   out.writeDebug("Initalize hhblits alignment ...")      
   # self stuff
   self.hits = []
   if hhblitsResults == '#unknown error':
     return
   # parse the hhblits results
   beginOfResults = False
   lines = hhblitsResults.split('\n')
   tmp = []
   i = 0
   for line in lines:
     # skip empty lines
     if not line.strip():
       continue
     # stop at the end of the hit list
     if line.find('Done') != -1:
       break
     # skip all lines before the actual search results begin
     if beginOfResults:
       i += 1
       # the detailed alignment section ('No 1') or the k-th hit ends the summary
       if line.startswith('No 1') or i > k:
         break
       items = line.split()
       hit_id, hit_e_value, query_hit = items[1], items[3], items[9]
       hit_from, hit_to = query_hit.split('-')[:2]
       hit_to = hit_to.split('(')[0]
       self.hits.append({
         'hit_id': hit_id,
         'hit_value': float(hit_e_value),
         'hit_from': int(hit_from),
         'hit_to': int(hit_to),
         'hit_order': False,
         'method': 'hhblits',
       })
     else:
       if line.find("No Hit") != -1:
         beginOfResults = True
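
For illustration, a hit line shaped like the summary table this parser expects; the ID and numbers are hypothetical, and real hhblits output may differ (a hit description containing spaces would shift the fixed indices).

line = "  1 P12345          99.9  1e-30  1e-35  200.0  0.0   35    1-35      1-35 (35)"
items = line.split()
# items[1] -> 'P12345', items[3] -> '1e-30', items[9] -> '1-35'
hit_from, hit_to = items[9].split('-')[:2]
print hit_from, hit_to.split('(')[0]   # -> 1 35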
Example #5
 def localBlast(seq="NWLGVKRQPLWTLVLILWPVIIFIILAITRTKFPP", database="../data/genes_UniProt.fasta", minEVal=1):
     out.writeDebug("Do a local blast search for {} in {}".format(seq, database))
     blastResults = commands.getstatusoutput(
         'echo "{}" | blast2 -p blastp -d {} -N -e {} -m 7'.format(seq, database, minEVal)
     )
     if blastResults[0] != 0:
         out.writeLog(
             "Blast search for {} in {} failed with exit code {}!".format(
                 seq, database, blastResults[0]
             )
         )
     return Blast(blastResults[1])
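
The commands module is Python 2 only; under Python 3 the equivalent call would go through subprocess.getstatusoutput, which returns the same (status, output) tuple. A sketch, not a tested drop-in replacement:

import subprocess

seq, database, minEVal = "NWLGVKRQPLWTLVLILWPVIIFIILAITRTKFPP", "../data/genes_UniProt.fasta", 1
status, output = subprocess.getstatusoutput(
    'echo "{}" | blast2 -p blastp -d {} -N -e {} -m 7'.format(seq, database, minEVal)
)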
Example #6
def learn_parameters(hpoGraph, uni2hpoDict, dataset):
    out.writeDebug('Start training the predictor.')
    from predictor import Predictor
    neuralNet = Predictor(None)
    # for cross-training the folds are swapped: the crossTrain fold is passed in
    # the 'test' slot, while the real test fold sits unused in the 'crossTrain' slot
    crossTrainSet = {'train': dataset['train'], 'crossTrain': dataset['test'], 'test': dataset['crossTrain']}
    
    trainingNodes = train_result_set(hpoGraph, uni2hpoDict, crossTrainSet)
    out.writeDebug('Collected all the nodes for training')
    
    if shortcut:
        # the module-level shortcut flag shortens training for quick test runs
        neuralNet.trainprediction(trainingNodes, maxEpochs=10)
    else:
        neuralNet.trainprediction(trainingNodes)

    return neuralNet
Example #7
  def runprediction(self, querySequence, graph):
    """Run the predictor on every node in the graph and store the resulting
    confidence in node.accepted. All nodes are expected to start with
    node.accepted == False."""
#    def acceptNodeAndParentNodes(graph, node):
#      node.accepted = True
#      stack = graph.getParents(node)
#      while len(stack) != 0:
#        cNode = graph.getHpoTermById(stack.pop())
#        cNode.accepted = True
#        stack.extend(graph.getParents(cNode))
    
    for cNodeID, cNode in graph.hpoTermsDict.iteritems():
      # ok, get the node to predict
      out.writeDebug("Perform prediction for node: {}".format(cNode.id))
      # get all features for the current node
      featuresValue = []
      for feature in self.features:
        featuresValue.append(feature(self, cNode, graph, querySequence))
      # ok, now run the neuronal network
      predictionResult = self.net.activate(featuresValue)
      out.writeDebug("Prediction result for node {} = {}".format(cNode.id, predictionResult))
      # check the prediction result:
      # the confidence is the difference of the two output neurons and lies
      # between -2 (lowest) and 2 (highest); ideally the net outputs (1, -1)
      # for accepted and (-1, 1) for not accepted
      confidence = predictionResult[0] - predictionResult[1]
      out.writeDebug("Prediction confidence: {}".format(confidence))
      # store the confidence in the accepted attribute
      cNode.accepted = confidence
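
The stored confidence ranges from -2 to 2; the output step in Example #12 rescales it to [0, 1]. A quick sanity check of that mapping:

def to_score(confidence):
    # rescale a confidence in [-2, 2] to a score in [0, 1], as in Example #12
    return (confidence + 2) / 4.0

print to_score(-2.0), to_score(0.0), to_score(2.0)   # -> 0.0 0.5 1.0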
Example #8
def reduce_sequences():
    # use a set of sequences with reduced sequence redundancy as the basis for
    # validation; the set was created with CD-HIT at 80% sequence similarity
    out.writeDebug('Prepare sequence similarity reduced data set from %s'%reducedFile)
    reduced_sequences = []

    for record in SeqIO.parse(open(reducedFile), 'fasta'):
        reduced_sequences.append((record.id, record.seq))
    shuffle(reduced_sequences)

    # also take care to reserve sequences that are in the same cluster as the test sequences

    out.writeDebug('Digest sequence clusterings from %s'%clusterFile)
    sequenceCluster = {}
    representative = ''
    sequences = set()

    for line in open(clusterFile):
        if not line.strip():
            continue
        if line.startswith('>'):
            # a '>' line starts a new cluster; store the previous one first
            if representative:
                sequenceCluster[representative] = sequences

            representative = ''
            sequences = set()
        else:
            sequence = line.split('>')[1].split('.')[0]
            sequences.add(sequence)
            if '*' in line:
                # representative sequences have a star
                representative = sequence
    # store the final cluster
    sequenceCluster[representative] = sequences

    return (reduced_sequences, sequenceCluster)
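
The cluster parser expects CD-HIT's .clstr layout; a minimal excerpt of that format (IDs hypothetical), where the '*' marks the cluster representative that the parser keys the cluster on:

>Cluster 0
0	35aa, >P12345... *
1	34aa, >Q67890... at 85.71%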
Example #9
def cross_validate(sequences, folds = 10):
    import hpoParser, time
    starttime = time.time()
    dataset_size = len(sequences)
    allPredictions = []

    # init the hpoParser
    hpoGraph = hpoParser.HpoGraph(hpoFile)

    # init the hpo-identifier dict
    out.writeDebug('Initialize dictionary with true annotations from %s'%hpoMappingFile)
    uni2hpoDict = {}
    f = open(hpoMappingFile)
    for line in f:
        line = line.strip()
        uni2hpoDict.update( { line.split("\t")[0] : line.split("\t")[1].split(",") } )
    f.close()

    # create folds
    for i in range(folds):
        now = (time.time() - starttime) / 60
        minutes = int(now % 60)
        hours = int(now / 60)
        out.writeDebug('Start with fold %s of %s' % (i + 1, folds))
        out.writeDebug('Time elapsed: %d:%02d' % (hours, minutes))
        # test fold
        test = sequences[i:dataset_size:folds]
        # fold used to learn the parameters
        crossTrain = sequences[(i+1)%folds:dataset_size:folds]
        # folds to train on; excluding the test and crossTrain folds here is not
        # strictly necessary, since both are reserved again during training
        train = []
        for j in range(folds):
            if j != i and j != (i+1)%folds:
                train = train + sequences[j:dataset_size:folds]
        dataset = {'train': train, 'crossTrain': crossTrain, 'test': test}
        # learn the parameters, whatever form they take; here they are the weights
        # of a neural net trained to recognize valid annotations
        predictor = learn_parameters(hpoGraph, uni2hpoDict, dataset)
        predictor.saveNeuronalNetwork('neuronalNetwork_Fold%s'%i)
        # test the parameters on the independent test fold
        allPredictions.append(predict_set(hpoGraph, uni2hpoDict, dataset, predictor))

        predictions = allPredictions[-1]
        print '***fold %s (FN = %s):***' % ((i + 1), predictions[1])
        for predictedSequence, predictedTerms in predictions[0]:
            for predictedNode in predictedTerms:
                print predictedSequence, predictedNode.id, predictedNode.accepted, predictedNode.TruePrediction

        if shortcut:
            break
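
The folds are interleaved slices, so for a given i the test, crossTrain, and train sets partition the shuffled sequences. A small illustration with 3 folds and made-up items:

sequences = ['a', 'b', 'c', 'd', 'e', 'f']
folds, i = 3, 0
print sequences[i::folds]                 # test       -> ['a', 'd']
print sequences[(i + 1) % folds::folds]   # crossTrain -> ['b', 'e']
# the remaining interleaved slice ['c', 'f'] forms the training set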
Example #10
def train_result_Sequence(hpoGraph, uni2hpoDict, dataset, name='', seq=''):
    out.writeDebug('Get training data for sequence name %s with sequence: %s'%(name, seq))

    import blast, hhblits
    # find similar sequences
    blastResults = blast.Blast.localBlast(seq=seq, database=blastDbFile)
    hhblitsResults = hhblits.HHBLITS.localHHBLITS(seq=str(seq), database=hhblitsDbFile)
    
    # now get the hpo-Identifiers for each similar sequence
    for hit in blastResults.hits:
        hit.update( { "hpoTerms" : uni2hpoDict[ hit[ "hit_id" ] ] } )
    for hit in hhblitsResults.hits:
        hit.update( { "hpoTerms" : uni2hpoDict[ hit[ "hit_id" ] ] } )

    # set of hits to ignore to avoid information leakage
    reserved = set([name])

    # also reserve the sequences in the clusters associated with the test set
    #for representative, sequence in dataset['crossTrain']:
    #    reserved = reserved | sequenceCluster[representative]

    for representative, sequence in dataset['test']:
        reserved = reserved | sequenceCluster[representative]

    
    # build and merge trees
    graph, hit_id = None, 0

    for hit in blastResults.hits + hhblitsResults.hits:
        # take only hits from the training set; ignore hits from the test or crossTrain set
        if hit['hit_id'] in reserved:
            out.writeDebug('Skip hit %s in database that is in the test data' % hit['hit_id'])
            continue
        subtree = hpoGraph.getHpoSubGraph(hit['hpoTerms'], {hit_id: hit})
        hit_id += 1
        if graph is None:
            graph = subtree
        else:
            graph += subtree

    # get training nodes
    trainingNodes = []
    if graph is not None:
        for node in graph.hpoTermsDict:
            # skip the root term
            if node == 'HP:0000001':
                continue
            ValidPrediction = node in uni2hpoDict[name]
            graph.hpoTermsDict[node].querySequence = seq
            # copy the node attributes for training
            trainingNodes.append((graph.hpoTermsDict[node].copy(), ValidPrediction))
            
    hpoGraph.clearAttr()

    # return the set of training nodes with the target variable
    return trainingNodes
Example #11
 def __init__(self, hpoFile="../data/hp.obo"):
   
   """ initalize an hpo graph by an hpo file """
   
   # debug message
   if hpoFile is not None:
     out.writeDebug("parsing hpo file " + str(hpoFile))
   # init the main class variable
   self.hpoTermsDict = {}
   self.isSubTree = hpoFile is None
   # if the file to parse is None, an empty HpoGraph is returned
   if hpoFile is None:
     return
   # helper function to analyse the lines
   def _analyseLines(self, lines):
     
     """ Analyse the parsed lines (helper function) """
     
      # file header or HPO term?
      if lines[0].startswith("[Term]"):
       # add a hpoterm by the hpoterms description
       for line in lines:
         # do nothing, if HpoTerm is_obsolete
         if line.startswith('is_obsolete:'):
           return
       term = HpoTerm( lines[1:] )
       self.hpoTermsDict.update( { term.id.split(" ")[0] : term } )
     else:
        for line in lines:
          # split the line at the first ':' into attribute name and value
          attrName = line[:line.find(":")].strip()
          attrVal = line[line.find(":")+1:].strip()
         # now add this as attribute
         if hasattr(self, attrName):
           if isinstance(getattr(self, attrName), list):
             getattr(self, attrName).append( attrVal )
           else:
             setattr(self, attrName, [ getattr(self, attrName), attrVal ])
         else:
           setattr(self, attrName, attrVal)
    # ok, parse the lines in the file
    try:
      f = open(hpoFile, "r")
     lines = []
     for line in f:
       # skip empty lines
       if line.strip() == "":
         continue
       # do something for non empty lines
       if line.startswith( "[Term]" ):
         _analyseLines(self, lines)
         lines = [ line ]
       else:
         lines.append(line)
     _analyseLines(self, lines)
     f.close()
    except Exception as e:
      out.writeError("Error parsing hpo file " + str(e) + " " + str(e.args))
    # good, now build the child relationships
   for key in self.hpoTermsDict:
     node = self.hpoTermsDict[key]
     if hasattr(node, "is_a"):
       if isinstance(node.is_a, list):
         for element in node.is_a:
           self.hpoTermsDict[ element.split(" ")[0] ].childrens.append(key)
       else:
         self.hpoTermsDict[ node.is_a.split(" ")[0] ].childrens.append(key)
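
The parser consumes the stanza layout of the OBO flat file used by hp.obo; a minimal [Term] stanza of the kind it expects (id and is_a carry the HPO identifier as the first whitespace-separated token):

[Term]
id: HP:0000118
name: Phenotypic abnormality
is_a: HP:0000001 ! All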
Example #12
    def predictSequence(args, hpoGraph, uni2hpoDict, name="Sequence", seq=""):
        # ok, do the whole thing
        try:
            # debug msg
            out.writeLog('Predict function for protein: id: "' + str(name) + '" sequence: "' + str(seq) + '"')

            # look up precalculated results if available
            foundInLookUp, hits = False, []
            if args.lookupdb:
                out.writeLog("Checking for precalculated results!")
                # ok, load them
                f = open(args.lookupdb, "r")
                for line in f:
                    if line.strip() == name.strip():
                        # oh cool, it's precalculated
                        foundInLookUp = True
                    elif foundInLookUp and line.startswith("\t"):
                        # ok, this belongs to result, load it
                        m = re.search("\t([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)", line)
                        hits.append(
                            {
                                "method": m.group(1),
                                "hit_id": m.group(2),
                                "hit_value": float(m.group(3)),
                                "hit_from": int(m.group(4)),
                                "hit_to": int(m.group(5)),
                                "hit_order": bool(m.group(6)),
                            }
                        )
                    elif foundInLookUp:
                        break
                f.close()

            # ok, first of all, get similar sequences!
            if not foundInLookUp:
                out.writeLog("Check blast and hhblits for sequence orthologs!")
                blastResults = blast.Blast.localBlast(seq=seq, database=args.blastDbFile, minEVal=args.blastMinEVal)
                for hit in blastResults.hits:
                    out.writeDebug("Blast: found hit: " + str(hit))
                hhblitsResults = hhblits.HHBLITS.localHHBLITS(seq=str(seq), database=args.hhblitsDbFile)
                for hit in hhblitsResults.hits:
                    out.writeDebug("hhblits: found hit: " + str(hit))
                hits.extend(blastResults.hits)
                hits.extend(hhblitsResults.hits)

            # reduce hits if fast prediction
            if args.fast:
                out.writeLog("Reduce hits for faster prediction!")
                hitsTmp = sorted(hits, key=lambda t: t["hit_value"])
                hits = hitsTmp[:6]

            # now get the hpo-Identifiers for each similar sequence
            out.writeLog("uniprot ids ({}) 2 HPO Terms".format(len(hits)))
            for hit in hits:
                try:
                    # do not log this; it could be gigabytes of output
                    #          out.writeDebug("found hpoTerms for " + str( hit[ "hit_id" ] ) + ": " + str( uni2hpoDict[ hit[ "hit_id" ] ] ) )
                    hit.update({"hpoTerms": uni2hpoDict[hit["hit_id"]]})
                except KeyError:
                    out.writeWarning("MISSING HPO TERMS FOR HIT: " + str(hit))

            # build and merge trees
            out.writeLog("Build and merge tree for similar sequences!")
            graph, hit_id = hpoGraph.getHpoSubGraph(hpoGraph.getRoot()), 0
            for hit in hits:
                #        out.writeDebug("@blast merging: {}".format(hit))
                subtree = hpoGraph.getHpoSubGraph(hit["hpoTerms"], {hit_id: hit})
                hit_id += 1
                graph += subtree

            # do the prediction
            out.writeLog("Run main prediction!")
            # init the predictor
            p = predictor.Predictor(args.neuronalNet)
            p.runprediction(seq, graph)
            # always accept the root
            for root in hpoGraph.getRoot():
                graph.getHpoTermById(root).accepted = 1

            # do the output
            out.writeLog("writing output")
            for node in graph.getAcceptedNodes(args.minimalConfidence):
                # rescale the confidence from [-2, 2] to [0, 1] for the output
                out.writeOutput("{}\t{}\t{}".format(name, node.id, "%.2f" % ((node.accepted + 2) / 4)))

            # svg image desired?
            if args.createSvgImage:
                out.writeLog("Create a svg image showing all results!")
                if graph != None:
                    graph.writeSvgImage(fileName=str(name) + ".svg")
                else:
                    out.writeWarning("Can't create a svg image from an empty tree!")

            # clear attrs from all tree nodes, so that these don't interfere with later predictions
        #      out.writeLog("Clear memory for next prediction")
        #      hpoGraph.clearAttr()

        except Exception as err:
            exc_type, exc_obj, exc_tb = sys.exc_info()
            out.writeError("Predicting Error: " + str(err) + " on line: " + str(exc_tb.tb_lineno))
            exit(1)
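
For reference, a hypothetical lookup-file entry in the layout the regex above expects: the sequence name on its own line, followed by one tab-prefixed line per hit carrying method, hit id, e-value, from, to, and order, all tab-separated.

# hypothetical lookup entry; field values are made up
entry = ("Sequence_1\n"
         "\tblast\tP12345\t1e-30\t1\t35\tFalse\n")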