def direct_match(db_file_name, patterns, max_errors):
    try:
        num_patterns = len(patterns)
    except TypeError:
        error = "Error: Invalid type of pattern data"
        raise Exception(error)
    if type(max_errors) is type([]):
        #clamp each per-pattern error count into the supported range;
        #index into the list directly, since rebinding a loop variable
        #would leave the list unchanged
        for i in range(len(max_errors)):
            if max_errors[i] > 8:
                print "An element of max errors is too high, changing it to 8"
                max_errors[i] = 8
            if max_errors[i] < 1:
                print "An element of max errors is too low, changing it to 1"
                max_errors[i] = 1
    else:
        if type(max_errors) is type(1):
            if max_errors > 8:
                print "The maximum error rate is too high, changing it to 8"
                max_errors = 8
            if max_errors < 1:
                print "The maximum error rate is too low, changing it to 1"
                max_errors = 1
            #expand the single value into one entry per pattern
            errors = max_errors
            max_errors = []
            for i in range(num_patterns):
                max_errors.append(errors)
        else:
            error = "Error: Invalid type for number of errors"
            raise Exception(error)
    #check at least one query has been specified
    if num_patterns < 1:
        error = "Error: At least one pattern to find must be specified"
        raise Exception(error)
    matchers = []
    #deal with a single query specified as a string rather than a list
    patterns_to_match = []
    if type(patterns) == type("abc"):
        patterns_to_match.append(patterns)
        num_patterns = 1
    else:
        patterns_to_match = patterns
    #create a pattern object for each query
    i = 0
    for pattern in patterns_to_match:
        #if the pattern is too long, simply abort: we don't want to deal
        #with overflow issues and there are better tools out there anyway;
        #similarly, if it is shorter than 2 aa, don't proceed
        if not 1 < len(pattern) < 60:
            error = "Error: A query protein had a length outside the range supported by this program"
            raise Exception(error)
        matchers.append(compile(pattern, len(pattern), max_errors[i]))
        i += 1
    #open the specified database for reading
    try:
        db_file = open(db_file_name, 'r')
    except IOError:
        error = "Error: Cannot open specified db file"
        raise Exception(error)
    #storage for database information
    rec = []
    header = ''
    #storage for information about each match; this allows for later
    #calculation of p values and output of this information to the user
    results_record_headers = []
    results_surrounding_text = []
    num_match_recs = 0
    #begin parsing database
    for line in db_file:
        if line[0] != '>':
            #database body section
            rec.append(line[:-1])
        else:
            #database header: attempt to match the previous body section
            record = ''.join(rec)
            begin_rec = 0
            for i in range(num_patterns):
                matches = agrepy(patterns_to_match[i], len(patterns_to_match[i]),
                                 record, len(record), 1, matchers[i])
                if matches:
                    if begin_rec == 0:
                        begin_rec = 1
                        num_match_recs += 1
                        results_record_headers.append(header.split()[1])
                    pattern_matches = []
                    for elm in matches:
                        #for some large strings on 64 bit platforms sagrepy can
                        #return invalid match data, so remove it here rather
                        #than risk problems later
                        if elm[0] < elm[1]:
                            pattern_matches.append(record[elm[0]:elm[1]])
                    results_surrounding_text.append(pattern_matches)
            rec = []
            header = line
    #attempt to match the final body section
    record = ''.join(rec)
    begin_rec = 0
    for i in range(num_patterns):
        matches = agrepy(patterns_to_match[i], len(patterns_to_match[i]),
                         record, len(record), 1, matchers[i])
        if matches:
            if begin_rec == 0:
                begin_rec = 1
                num_match_recs += 1
                results_record_headers.append(header.split()[1])
            pattern_matches = []
            for elm in matches:
                #for some large strings on 64 bit platforms sagrepy can
                #return invalid match data, so remove it here rather
                #than risk problems later
                if elm[0] < elm[1]:
                    pattern_matches.append(record[elm[0]:elm[1]])
            results_surrounding_text.append(pattern_matches)
    #end parsing database
    #now calculate p values and output results to the user
    outputResults(patterns_to_match, num_match_recs, results_record_headers,
                  results_surrounding_text)
    db_file.close()
    #finally, return result headers to the caller in case
    #they wish to make use of them in some other way
    return results_record_headers
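
#A minimal usage sketch for direct_match (illustrative only: the database
#file name and query peptides below are hypothetical, and compile/agrepy/
#outputResults are assumed to be provided by the accompanying
#approximate-matching extension module):
#
#    #search two short queries, each allowing up to 2 errors
#    headers = direct_match("proteins.fasta", ["MKFL", "CLF"], 2)
#    print "%d record(s) matched" % len(headers)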
def match(self, patterns, max_errors):
    """
    Parameters:
    patterns   - a string (e.g. "MKFL")
               - a single list element (e.g. ["MKFL"])
               - multiple list elements (e.g. ["LIL","MKFL","CLF"])
               - grouped list elements (e.g. [["LIL","MKFL"],"CLF"])

    The first two cases are equivalent and result in a search for all
    records that contain the string/element. Multiple list elements
    result in a search for records that contain ANY of the elements in
    the list.

    Elements in a list can also be grouped together. A group containing
    only a single element has no effect, but specifying more than one
    element in a group means that only records containing ALL elements
    in that group are searched for. If there is more than one group, a
    record need only contain one complete group to match; in other
    words, elements within a group are ANDed together and the groups
    are ORed. Note also that false drops may return records containing
    only a subset of the patterns even when they are not explicitly
    searched for. The main advantage of grouping elements together like
    this is dramatically faster searches.

    max_errors - an integer
               - a list (e.g. [2,3,1])

    A single integer value is applied to ALL patterns. If a separate
    error rate is desired for each pattern, specify a list with one
    integer per pattern.
    """
    #---------------------------
    #1. Check validity of inputs
    #---------------------------
    #put patterns into uniform list format;
    #first deal with a single query specified as a string rather than a list
    tmp_patterns = []
    if type(patterns) == type("abc"):
        tmp_patterns.append(patterns)
    else:
        tmp_patterns = patterns
    #now pull out all groups, storing the indices of each group's members
    groups = []
    patterns_to_match = []
    i = 0
    for pat in tmp_patterns:
        if type(pat) == type([]):
            group = []
            for elm in pat:
                patterns_to_match.append(elm)
                group.append(i)
                i += 1
            groups.append(group)
        else:
            patterns_to_match.append(pat)
            i += 1
    #add a sentinel for pattern sets without groups
    if len(groups) == 0:
        groups.append([-1])
    #finally, get some data about this list
    try:
        num_patterns = len(patterns_to_match)
    except TypeError:
        error = "Error: Invalid type of pattern data"
        raise Exception(error)
    #and check at least one query has been specified
    if num_patterns < 1:
        error = "Error: At least one pattern to find must be specified"
        raise Exception(error)
    #now move on to checking max_errors
    if type(max_errors) is type([]):
        #clamp each per-pattern error count into the supported range;
        #index into the list directly, since rebinding a loop variable
        #would leave the list unchanged
        for i in range(len(max_errors)):
            if max_errors[i] > 8:
                print "An element of max errors is too high, changing it to 8"
                max_errors[i] = 8
            if max_errors[i] < 1:
                print "An element of max errors is too low, changing it to 1"
                max_errors[i] = 1
        if len(max_errors) > num_patterns:
            error = "Error: More error values were specified than patterns given"
            raise Exception(error)
        if len(max_errors) < num_patterns:
            error = "Error: Fewer error values were specified than patterns given"
            raise Exception(error)
    else:
        if type(max_errors) is type(1):
            if max_errors > 8:
                print "The maximum error rate is too high, changing it to 8"
                max_errors = 8
            if max_errors < 1:
                print "The maximum error rate is too low, changing it to 1"
                max_errors = 1
            #expand the single value into one entry per pattern
            errors = max_errors
            max_errors = []
            for i in range(num_patterns):
                max_errors.append(errors)
        else:
            #raise, as the other input checks do, rather than silently
            #returning the error string
            error = "Error: Invalid type for number of errors"
            raise Exception(error)
    #------------------------
    #2. Encode Query Patterns
    #------------------------
    matchers = []
    piece_sizes, segments = self.PIECE_SIZES, self.SEGMENTS
    i = 0
    for pattern in patterns_to_match:
        #create a pattern object for each query
        matchers.append(compile(pattern, len(pattern), max_errors[i]))
        #check the query length is within acceptable bounds;
        #if the pattern is too long, simply abort: we don't want to deal
        #with overflow issues and there are better tools out there anyway
        pattern_length = len(pattern)
        if pattern_length > self.MAX_QUERY_LENGTH:
            error = ("Error: This program currently cannot deal with proteins "
                     "longer than " + str(self.MAX_QUERY_LENGTH) + " aa, sorry")
            clearMatchInfo()
            raise Exception(error)
        #break the query into a number of segments equal to the maximum
        #number of errors for the query plus one; by the pigeonhole
        #principle at least one segment must then match exactly
        num_segs = max_errors[i] + 1
        segs_size = pattern_length / num_segs
        #if the pattern is too short, abort all preprocessed matching and
        #hand everything to direct_match instead
        if segs_size < piece_sizes[0]:
            print "A query had too many errors for its size"
            print "Using direct_match instead..."
            clearMatchInfo()
            return direct_match(self.database_name, patterns, max_errors)
        #for each segment encode all pieces
        createSegments(num_segs)
        completed = 0
        for j in range(num_segs):
            #on the last segment, make the size the leftover
            if j + 1 == num_segs:
                segs_size = pattern_length - (segs_size * (num_segs - 1))
            for elm in segments[segs_size]:
                encodeQueryPiece(j, pattern[completed:completed + elm], elm - 2)
                completed += elm
        #AND this pattern into the dropset of any group it belongs to,
        #otherwise OR it in
        for elm in groups:
            if i in elm:
                updateDropsetAnd()
            else:
                updateDropsetOr()
        i += 1
    #-------------------------
    #3. Set up database access
    #-------------------------
    try:
        db_file = open(self.database_name, 'r')
    except IOError:
        error = "Error: Cannot open specified db file"
        raise Exception(error)
    #----------------------------------------------
    #4. Search dropped database records for matches
    #----------------------------------------------
    #storage for information about each match; this allows for later
    #calculation of p values and output of this information to the user
    results_record_headers = []
    results_surrounding_text = []
    num_match_recs = 0
    #handles for called functions
    index = self.header_index
    db_pos = db_file.seek
    #start parsing the dropfile
    rec_num = nextDrop(1)
    while rec_num:
        db_pos(index[rec_num - 1])
        title = db_file.readline().split()[1]
        rec = []
        line = db_file.readline()
        while line and line[0] != '>':
            rec.append(line[:-1])
            line = db_file.readline()
        record = ''.join(rec)
        begin_rec = 0
        for i in range(num_patterns):
            matches = agrepy(patterns_to_match[i], len(patterns_to_match[i]),
                             record, len(record), 1, matchers[i])
            if matches:
                if begin_rec == 0:
                    begin_rec = 1
                    num_match_recs += 1
                    results_record_headers.append(title)
                pattern_matches = []
                for elm in matches:
                    #clamp negative start offsets before slicing
                    start = 0
                    if elm[0] > 0:
                        start = elm[0]
                    pattern_matches.append(record[start:elm[1]])
                results_surrounding_text.append(pattern_matches)
        rec_num = nextDrop(rec_num + 1)
    clearMatchInfo()
    db_file.close()
    #--------------------
    #5. Deal with results
    #--------------------
    outputResults(patterns_to_match, num_match_recs, results_record_headers,
                  results_surrounding_text)
    #finally, return result headers to the caller in case
    #they wish to make use of them in some other way
    return results_record_headers
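
#A minimal usage sketch for match (illustrative only: "Searcher" stands in
#for whatever class provides this method, and its constructor signature is
#assumed, not taken from this excerpt):
#
#    s = Searcher("proteins.fasta")
#    #"LIL" and "MKFL" form one group (ANDed), ORed with "CLF";
#    #one error value per pattern, in pattern order
#    hits = s.match([["LIL", "MKFL"], "CLF"], [2, 2, 1])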