Example #1
0
def direct_match(db_file_name,patterns,max_errors):
    try:
        num_patterns = len(patterns)
    except TypeError:
        error = "Error:  Invalid type of pattern data"
        raise Exception(error) 
    if type(max_errors) is type([1]):
        for elm in max_errors:
            if elm>8:
                print "An element of max errors is too high, changing it to 8"
                elm=8
            if elm<1:
                print "An element of max errors is too low, changing it to 1"
                elm=1
    else:
        if type(max_errors) is type(1):
            if max_errors>8:
                print "The maximum error rate is to high, changing it to 8"
                max_errors=8
            if max_errors<1:
                print "The maximum error rate is too low, changing it to 1"
                max_errors=1
            errors = max_errors
            max_errors = []
            for i in range(num_patterns):
                max_errors.append(errors)  
        else:
            error = "Error:  Invalid type for number of errors"
            raise Exception(error) 
    #check at least one query has been specified
    if num_patterns<1:
        error = "Error: At least one pattern to find must be specified"
        raise Exception(error)
    matchers = []
    #deal with a single query specified as a string rather than a list
    patterns_to_match = []
    if type(patterns)==type("abc"):
        patterns_to_match.append(patterns)
        num_patterns=1
    else:
        patterns_to_match=patterns
    #create a pattern object for each query
    i=0
    for pattern in patterns_to_match:
        #if the pattern is too long simply abort, we don't want to deal
        #with overflow issues and there are better tools out there anyway
        #similarly if it less than 2 aa then dont proceed
        if not 1<len(pattern)<60:
            error = "Error: A query protein had a length outside the range supported by this program"
            raise Exception(error)
        matchers.append(compile(pattern,len(pattern),max_errors[i]))
        i+=1
    #open the specified database for reading    
    try:
        db_file = open(db_file_name, 'r')
    except IOError:
        error = "Error: Cannot open specified db file"
        raise Exception(error)     
    #storage for database information 
    rec = []
    header = ''
    #storage for information about each match
    #this allows for later calculation of p values and
    #output of this information to the user
    results_record_headers = []
    results_surrounding_text = []
    num_match_recs = 0
    #begin parsing database 
    for line in db_file:           
        #database body section
        if line[0] != '>':
            rec.append(line[:-1])               
        #database header
        else:  
            #attempt to match with previous body section
            record = ''.join(rec)
            begin_rec = 0
            for i in range(num_patterns):
                matches = agrepy(patterns_to_match[i], len(patterns_to_match[i]),\
                                 record,len(record),1,matchers[i])
                if matches:
                    if begin_rec==0:
                        begin_rec = 1
                        num_match_recs +=1
                    results_record_headers.append(split(header)[1])
                    pattern_matches = []
                    for elm in matches:
                        #for some large strings on 64 bit platforms sagrepy can return invalid
                        #match data, so remove this here rather than risk problems later
                        if elm[0]<elm[1]:
                            pattern_matches.append(record[elm[0]:elm[1]]) 
                    results_surrounding_text.append(pattern_matches)                  
            rec = []
            header = line
    #attempt to match the final body section
    record = ''.join(rec)
    begin_rec = 0
    for i in range(num_patterns):
        matches = agrepy(patterns_to_match[i],\
                    len(patterns_to_match[i]),record,len(record),1,matchers[i])
        if matches:
            if begin_rec==0:
                begin_rec = 1
                num_match_recs +=1
            results_record_headers.append(split(header)[1])
            pattern_matches = []
            for elm in matches:
                #for some large strings on 64 bit platforms sagrepy can return invalid
                #match data, so remove this here rather than risk problems later
                if elm[0]<elm[1]:
                    pattern_matches.append(record[elm[0]:elm[1]])  
            results_surrounding_text.append(pattern_matches)  
    #end parsing database
    #now calculate p values and output results to the user
    outputResults(patterns_to_match,num_match_recs,results_record_headers,results_surrounding_text)
    db_file.close()
    #Finally return result headers to the caller in case
    #they wish to make use of them in some other way
    return results_record_headers  
Example #2
0
    def match(self,patterns,max_errors):
        """ Parameters:
            
            patterns - a string (i.e. "MKFL")
                    - a single list element (i.e. ["MKFL"])
                    - multiple list elements (i.e. ["LIL","MKFL","CLF"])
                    - grouped list elements (i.e.  [["LIL","MKFL"],"CLF"])
                                            
                    The first two cases are equivalent, and will result in
                    a search for all records that contain the string/element
                    
                    Multiple list elements will result in a search for any
                    records that contain ANY of the elements in the list
                    
                    Elements in a list can also be grouped together. If a
                    group contains only a single element it will not have
                    any effect, but specifying more than one element in a
                    group has the effect that only records that contain ALL
                    elements in a group will be searched for (but if there 
                    is more than one group a record needs to contain only one
                    group to match, in other words elements within a group
                    are ANDed together, and the groups are ORed). Also it
                    is possible that false drops will return some subset
                    of the patterns even if they are not explicitly
                    searched for. The main advantage of grouping elements 
                    together like this is dramatically faster searches.
                    
        max-errors - an integer
                    - a list (i.e [2,3,1])
                    
                    A single integer value will be applied to ALL patterns.
                    
                    If a seperate error rate is desired for each pattern then
                    specify a list with 1 integer per pattern

        """    
        #---------------------------
        #1. Check validity of inputs
        #---------------------------
        #put patterns into uniform list format
        #first deal with a single query specified as a string rather than a list
        tmp_patterns = []
        if type(patterns)==type("abc"):
            tmp_patterns.append(patterns)
        else:
            tmp_patterns=patterns
        #now pull out all groups, storing the info of each
        groups = []   
        patterns_to_match = []
        i=0
        for pat in tmp_patterns:
            if type(pat)==type([]):
                group = []
                for elm in pat:
                    patterns_to_match.append(elm)
                    group.append(i)
                    i+=1
                groups.append(group)  
            else:
                patterns_to_match.append(pat)
                i+=1   
        #add an int for patterns without groups
        if len(groups)==0:
            groups.append([-1])
        #Finally get some data about this list 
        try:
            num_patterns = len(patterns_to_match)
        except TypeError:
            error = "Error:  Invalid type of pattern data"
            raise Exception(error)
        #and check at least one query has been specified
        if num_patterns<1:
            error = "Error:  At least one pattern to find must be specified"
            raise Exception(error)   
        #Now move on to checking max_errors    
        if type(max_errors) is type([]):
            i=0
            for elm in max_errors:
                if elm>8:
                    print "An element of max errors is too high, changing it to 8"
                    elm=8
                if elm<1:
                    print "An element of max errors is too low, changing it to 1"
                    elm=1
                i+=1
            if i>num_patterns:
                error = "Error:  More errors were specified than patterns given"
                raise Exception(error)
            if i<num_patterns:
                error = "Error:  Less errors were specified than patterns given"
                raise Exception(error)
        else:
            if type(max_errors) is type(1):
                if max_errors>8:
                    print "The maximum error rate is to high, changing it to 8"
                    max_errors=8
                if max_errors<1:
                    print "The maximum error rate is too low, changing it to 1"
                    max_errors=1
                errors = max_errors
                max_errors = []
                for i in range(num_patterns):
                    max_errors.append(errors)   
            else:
                error = "Error:  Invalid type for number of errors"
                print error
                return error 

        #------------------------
        #2. Encode Query Patterns
        #------------------------
        matchers = []
        query_pieces = []
        piece_sizes,segments = self.PIECE_SIZES,self.SEGMENTS
        i=0
        for pattern in patterns_to_match:
            #create a pattern object for each query
            matchers.append(compile(pattern,len(pattern),max_errors[i]))
            #check the query length is within acceptable bounds
            pattern_length = len(pattern)      
            #if the pattern is too long simply abort, we don't want to deal
            #with overflow issues and there are better tools out there anyway
            if pattern_length>self.MAX_QUERY_LENGTH:
                error = []
                error.append("Error: This program currently cannot deal with ")
                error.append("proteins longer than ")
                error.append(str(self.MAX_QUERY_LENGTH))
                error.append(" aa, sorry")
                error = ''.join(error)
                clearMatchInfo()
                raise Exception(error)
            #break the query into a number of segments equal to the 
            #maximum number of errors for the query plus one
            num_segs = max_errors[i]+1
            segs_size=pattern_length/num_segs
            #if the pattern is too short then abort all 
            #preprocessed matching and give all input to direct
            #match to deal with and return to the user
            if segs_size<piece_sizes[0]:
                print "A query had too many errors for its size"
                print "Using direct_match instead..."
                clearMatchInfo()
                return direct_match(self.database_name,patterns,max_errors)
            #for each segment encode all pieces
            createSegments(num_segs)
            completed=0
            for j in range(num_segs):
                #on the last one make size the leftover
                if j+1==num_segs:
                    segs_size = pattern_length-(segs_size*(num_segs-1))
                for elm in segments[segs_size]:
                    encodeQueryPiece(j,pattern[completed:completed+elm],elm-2)
                    completed+=elm
            for elm in groups:
                if i in elm:
                    updateDropsetAnd()
                else:
                    updateDropsetOr()
            i+=1

        #-------------------------
        #3. Set up database access
        #-------------------------  
        try:
            db_file = open(self.database_name, 'r')
        except IOError:
            error = "Error: Cannot open specified db file"
            raise Exception(error)  
            
        #----------------------------------------------
        #4. Search dropped database records for matches
        #----------------------------------------------   
        #storage for information about each match
        #this allows for later calculation of p values and
        #output of this information to the user
        results_record_headers = []
        results_surrounding_text = []
        num_match_recs = 0
        #handles for called functions
        index = self.header_index
        db_pos = db_file.seek
        #start parsing dropfile
        rec_num=nextDrop(1)
        while(rec_num):  
            db_pos(index[rec_num-1])
            title =  db_file.readline().split()[1]
            rec = []
            line = db_file.readline()
            while(line and line[0]!='>'):
                rec.append(line[:-1])
                line = db_file.readline()
            record = ''.join(rec)
            begin_rec = 0
            for i in range(num_patterns):
                matches = agrepy(patterns_to_match[i], len(patterns_to_match[i]),\
                                 record,len(record),1,matchers[i])
                if matches:
                    if begin_rec==0:
                        begin_rec = 1
                        num_match_recs +=1
                    results_record_headers.append(title)
                    pattern_matches = []
                    for elm in matches:
                        start=0
                        if elm[0]>0:
                            start = elm[0]
                        pattern_matches.append(record[start:elm[1]]) 
                    results_surrounding_text.append(pattern_matches) 
            rec_num=nextDrop(rec_num+1)
        clearMatchInfo()      
        db_file.close()
        
        #--------------------
        #5. Deal with results
        #-------------------- 
        outputResults(patterns_to_match,num_match_recs,results_record_headers,
                      results_surrounding_text)
        #Finally return result headers to the caller in case
        #they wish to make use of them in some other way
        return results_record_headers