def main(data_df, model_df, start=0, end=None, err=False):
    """Estimate the mutual information between model values and the dataset.

    The sequences in ``data_df`` are optionally trimmed to ``[start:end]``,
    rows with zero total counts or with the wrong sequence length are
    dropped, the model matrix is evaluated on every remaining sequence, and
    the rows (sorted by model value) are handed to
    ``EstimateMutualInfoforMImax.alt4``.

    Args:
        data_df: dataset dataframe with exactly one sequence column and the
            count columns reported by ``utils.get_column_headers``.  It is
            copied before use, so the caller's frame is left untouched.
        model_df: model dataframe; must contain a ``pos`` column and one
            ``val_<char>`` column per character of the model alphabet.
        start: first sequence position (0-based) covered by the model.
        end: one past the last covered position; a falsy value means
            "to the end of the sequence".
        err: when true, also estimate the uncertainty of the result from
            15 random half-size subsamples.

    Returns:
        ``(MI, Std)``.  ``Std`` is ``np.nan`` when ``err`` is false;
        otherwise it is the standard deviation of the subsample estimates
        divided by ``sqrt(2)``.

    Raises:
        SortSeqError: if the dataset does not have exactly one sequence
            column, if it contains no usable rows, or if the trimmed
            sequence length does not match the model length.
    """
    dicttype, modeltype = qc.get_model_type(model_df)
    seq_cols = qc.get_cols_from_df(data_df, 'seqs')
    if not len(seq_cols) == 1:
        raise SortSeqError(
            'Dataframe has multiple seq cols: %s' % str(seq_cols))
    seq_dict, inv_dict = utils.choose_dict(dicttype, modeltype=modeltype)

    # Set name of sequences column based on type of sequence.
    type_name_dict = {'dna': 'seq', 'rna': 'seq_rna', 'protein': 'seq_pro'}
    seq_col_name = type_name_dict[dicttype]

    # Work on a private copy: the trimming and the 'ct' column added below
    # would otherwise leak into the caller's dataframe, and a second call
    # with the same start/end would trim already-trimmed sequences.
    data_df = data_df.copy()
    if data_df.shape[0] == 0:
        raise SortSeqError('Dataset contains no sequences.')

    # Cut the sequences based on start and end, then check that the result
    # is compatible with the model.
    if start != 0 or end:
        data_df.loc[:, seq_col_name] = \
            data_df.loc[:, seq_col_name].str.slice(start, end)
        # Positional access: the index is not guaranteed to contain label 0.
        trimmed_length = len(data_df[seq_col_name].iloc[0])
        num_positions = len(model_df.loc[:, 'pos'])
        if modeltype == 'MAT':
            if trimmed_length != num_positions:
                raise SortSeqError(
                    'model length does not match dataset length')
        elif modeltype == 'NBR':
            # A neighbor model has one row per adjacent pair of positions.
            if trimmed_length != num_positions + 1:
                raise SortSeqError(
                    'model length does not match dataset length')

    col_headers = utils.get_column_headers(data_df)
    if 'ct' not in data_df.columns:
        data_df['ct'] = data_df[col_headers].sum(axis=1)
    data_df = data_df[data_df.ct != 0]
    if data_df.shape[0] == 0:
        raise SortSeqError('Dataset contains no sequences with nonzero counts.')

    # Keep only sequences of the expected length.  The sequences have
    # already been trimmed above, so when `end` is open the first row's
    # length IS the target length (subtracting `start` again would be wrong).
    if not end:
        seqL = len(data_df[seq_col_name].iloc[0])
    else:
        seqL = end - start
    data_df = data_df[data_df[seq_col_name].apply(len) == seqL]

    # Column names of the model matrix, in the order given by inv_dict.
    model_df_headers = [
        'val_' + str(inv_dict[i]) for i in range(len(seq_dict))]

    # Now we evaluate the expression of each sequence according to the model.
    seq_mat, wtrow = numerics.dataset2mutarray(data_df.copy(), modeltype)
    temp_df = data_df.copy()
    temp_df['val'] = numerics.eval_modelmatrix_on_mutarray(
        np.array(model_df[model_df_headers]), seq_mat, wtrow)
    temp_sorted = temp_df.sort_values(by='val')
    temp_sorted.reset_index(inplace=True, drop=True)

    MI = EstimateMutualInfoforMImax.alt4(temp_sorted, coarse_graining_level=0)

    if not err:
        Std = np.nan
    else:
        num_subsamples = 15
        half_size = len(data_df.index) // 2
        sub_MI = np.zeros(num_subsamples)
        for i in range(num_subsamples):
            sub_df = data_df.sample(half_size)
            sub_df.reset_index(inplace=True, drop=True)
            # sub_df is already trimmed and filtered, so the recursive call
            # uses the default start/end and skips error estimation.
            sub_MI[i], _ = main(sub_df, model_df, err=False)
        Std = np.std(sub_MI) / np.sqrt(2)
    return MI, Std
def main(dataset_df, model_df, left=None, right=None):
    """Evaluate a model on every sequence of a dataset.

    The region of each sequence covered by the model is selected, the model
    is evaluated on it, and the result is stored in a new ``val`` column.

    Args:
        dataset_df: dataset dataframe; validated with
            ``qc.validate_dataset``.  It is not modified (a copy is used).
        model_df: model dataframe; validated with ``qc.validate_model``.
        left: optional position of the left edge of the model within the
            sequences.  Mutually exclusive with ``right``.
        right: optional position one past the right edge of the model.
            Mutually exclusive with ``left``.  When neither is given, the
            region is taken from the model's ``pos`` column.

    Returns:
        The dataframe returned by ``qc.validate_dataset(out_df, fix=True)``,
        where ``out_df`` is the trimmed copy of the dataset with the added
        ``val`` column.

    Raises:
        SortSeqError: if both ``left`` and ``right`` are given, if the
            resulting region is empty or falls outside the sequences, or if
            the model type is not ``'MAT'`` or ``'NBR'``.
    """
    # Validate dataframes
    qc.validate_dataset(dataset_df)
    qc.validate_model(model_df)

    # Detect model type based on columns
    seqtype, modeltype = qc.get_model_type(model_df)
    seqcol = qc.seqtype_to_seqcolname_dict[seqtype]

    # Set start and end based on left or right
    if left is not None and right is not None:
        raise SortSeqError('Cannot set both left and right at same time.')
    # A neighbor model has one row per adjacent pair, hence the extra 1.
    site_length = model_df.shape[0] + (1 if modeltype == 'NBR' else 0)
    if left is not None:
        start = left
        end = start + site_length
    elif right is not None:
        end = right
        start = end - site_length
    else:
        start = model_df['pos'].values[0]
        end = model_df['pos'].values[-1] + (2 if modeltype == 'NBR' else 1)

    # Validate start and end positions.  An explicit raise is used instead
    # of `assert`, which is stripped when Python runs with -O.
    if not start < end:
        raise SortSeqError('Invalid start=%d, end=%d' % (start, end))
    # Positional access: the index is not guaranteed to contain label 0.
    seq_length = len(dataset_df[seqcol].iloc[0])
    if start < 0:
        raise SortSeqError('Invalid start=%d' % start)
    if end > seq_length:
        raise SortSeqError(
            'Invalid end=%d for seq_length=%d' % (end, seq_length))

    # Select target sequence region.  The sequence column depends on the
    # sequence type, so it must not be hard-coded to 'seq'.
    out_df = dataset_df.copy()
    out_df.loc[:, seqcol] = out_df.loc[:, seqcol].str.slice(start, end)

    # Create model object of correct type
    if modeltype == 'MAT':
        mymodel = Models.LinearModel(model_df)
    elif modeltype == 'NBR':
        mymodel = Models.NeighborModel(model_df)
    else:
        raise SortSeqError('Unrecognized model type %s' % modeltype)

    # Compute values
    out_df['val'] = mymodel.evaluate(out_df)

    # Validate dataframe and return
    return qc.validate_dataset(out_df, fix=True)
def __init__(self, model_df):
    """Build a neighbor model from a model dataframe.

    A copy of ``model_df`` is passed through ``qc.validate_model`` with
    ``fix=True``; the result is stored on the instance together with the
    sequence type, the character dictionaries from ``utils.choose_dict``,
    the model length (number of rows plus one) and the transposed matrix
    of the value columns.

    Raises:
        SortSeqError: if the dataframe does not describe an 'NBR' model.
    """
    checked_df = qc.validate_model(model_df.copy(), fix=True)
    seqtype, modeltype = qc.get_model_type(checked_df)
    if modeltype != 'NBR':
        raise SortSeqError('Invalid modeltype: %s' % modeltype)

    char_to_index, index_to_char = utils.choose_dict(
        seqtype, modeltype=modeltype)

    self.seqtype = seqtype
    self.seq_dict = char_to_index
    self.inv_dict = index_to_char
    self.df = checked_df
    # One more position than there are rows in the model dataframe.
    self.length = checked_df.shape[0] + 1

    # Matrix part of the model: value columns, stored transposed.
    value_columns = qc.get_cols_from_df(checked_df, 'vals')
    self.matrix = np.array(checked_df[value_columns]).T
def main(model_df, contig_list, numsites=10, verbose=False):
    """Scan contigs with a model and return the highest-scoring sites.

    Every window of the model's length in every contig is evaluated with
    the model (and, for DNA, its reverse complement as well).  After each
    contig the running list is sorted by value and cropped to ``numsites``
    rows, so at most ``numsites`` sites are kept at any time.

    Args:
        model_df: model dataframe; validated with ``qc.validate_model``.
        contig_list: iterable of ``(contig_str, contig_name, pos_offset)``
            tuples.  ``pos_offset`` is added to the window positions to
            form the reported ``left``/``right`` coordinates.  Contigs
            shorter than the model are skipped.
        numsites: maximum number of sites to return.
        verbose: when true, write a progress dot to stdout per contig.

    Returns:
        The dataframe returned by ``qc.validate_sitelist(..., fix=True)``
        with columns ``val``, the sequence column, ``left``, ``right``,
        ``ori`` and ``contig``, sorted by descending ``val``.

    Raises:
        SortSeqError: if a contig contains a character outside the
            alphabet of the model's sequence type, if the model type is
            not ``'MAT'`` or ``'NBR'``, or if no contig is long enough to
            contain a site.
    """
    # Determine type of string from model
    qc.validate_model(model_df)
    seqtype, modeltype = qc.get_model_type(model_df)

    # Check that all characters are from the correct alphabet
    alphabet = qc.seqtype_to_alphabet_dict[seqtype]
    invalid_char = re.compile(r"[^%s]" % alphabet)
    for contig_str, contig_name, pos_offset in contig_list:
        if invalid_char.search(contig_str):
            raise SortSeqError(
                'Invalid character for seqtype %s found in %s.' %
                (seqtype, contig_name))

    # Create model object to evaluate on seqs
    if modeltype == 'MAT':
        model_obj = Models.LinearModel(model_df)
    elif modeltype == 'NBR':
        model_obj = Models.NeighborModel(model_df)
    else:
        # Without this branch model_obj would be unbound below.
        raise SortSeqError('Unrecognized model type %s' % modeltype)

    # Accumulate sites contig by contig
    seq_col = qc.seqtype_to_seqcolname_dict[seqtype]
    columns = ['val', seq_col, 'left', 'right', 'ori', 'contig']
    L = model_obj.length
    sitelist_df = pd.DataFrame(columns=columns)
    for contig_str, contig_name, pos_offset in contig_list:
        if len(contig_str) < L:
            continue

        this_df = pd.DataFrame(columns=columns)
        num_sites = len(contig_str) - L + 1
        poss = np.arange(num_sites).astype(int)
        this_df['left'] = poss + pos_offset
        this_df['right'] = poss + pos_offset + L - 1
        this_df[seq_col] = fast.seq2sitelist(contig_str, L)  # Cython
        this_df['ori'] = '+'
        this_df['contig'] = contig_name
        this_df['val'] = model_obj.evaluate(this_df[seq_col])
        sitelist_df = pd.concat([sitelist_df, this_df], ignore_index=True)

        # If scanning DNA, scan reverse-complement as well.  this_df is
        # reused: positions and contig name are the same for both strands.
        if seqtype == 'dna':
            this_df[seq_col] = fast.seq2sitelist(contig_str, L, rc=True)  # Cython
            this_df['ori'] = '-'
            this_df['val'] = model_obj.evaluate(this_df[seq_col])
            sitelist_df = pd.concat([sitelist_df, this_df], ignore_index=True)

        # Sort by value and reindex
        sitelist_df.sort_values(by='val', ascending=False, inplace=True)
        sitelist_df.reset_index(drop=True, inplace=True)

        # Crop list at numsites so the running list stays small
        if sitelist_df.shape[0] > numsites:
            sitelist_df.drop(sitelist_df.index[numsites:], inplace=True)

        if verbose:
            # sys.stdout.write works on both Python 2 and Python 3,
            # unlike the print statement.
            sys.stdout.write('. ')
            sys.stdout.flush()

    if verbose:
        sys.stdout.write('\n')
        sys.stdout.flush()

    # If no sites were found, raise error
    if sitelist_df.shape[0] == 0:
        raise SortSeqError(
            'No full-length sites found within provided contigs.')

    sitelist_df = qc.validate_sitelist(sitelist_df, fix=True)
    return sitelist_df
def wrapper(args):
    """Command-line entry point for scan_model.main().

    Loads the model named by ``args.model``, builds the list of contigs
    either from ``args.seq`` or from a FASTA source (``args.i`` or stdin),
    runs ``main`` and writes the resulting dataframe to ``args.out`` or
    stdout.

    Raises:
        SortSeqError: if ``args.chunksize`` or ``args.numsites`` is not
            positive, if both ``args.i`` and ``args.seq`` are given, or if
            the FASTA source yields no contigs.
    """
    # Prepare input to main
    model_df = io.load_model(args.model)
    _seqtype, modeltype = qc.get_model_type(model_df)

    # Site length: one per model row, plus one for neighbor models.
    L = model_df.shape[0]
    if modeltype == 'NBR':
        L += 1

    chunksize = args.chunksize
    if not chunksize > 0:
        raise SortSeqError('chunksize=%d must be positive' % chunksize)

    if args.numsites <= 0:
        raise SortSeqError('numsites=%d must be positive.' % args.numsites)

    if args.i and args.seq:
        raise SortSeqError('Cannot use flags -i and -s simultaneously.')

    if args.seq:
        # Sequence was provided manually on the command line.
        manual_seq = args.seq
        if args.circular:
            # Append the first L-1 characters so that sites spanning the
            # origin are included.
            manual_seq += manual_seq[:L - 1]
        contig_list = [(manual_seq, 'manual', 0)]
    else:
        # Otherwise, read sequences from a FASTA file or stdin.
        contig_list = []
        if args.i:
            inloc = io.validate_file_for_reading(args.i)
        else:
            inloc = sys.stdin

        # Each chunk holds `chunksize` sites, i.e. chunksize + L - 1
        # characters; consecutive chunks overlap by L - 1 characters.
        window = chunksize + L - 1
        for index, record in enumerate(SeqIO.parse(inloc, 'fasta')):
            name = record.name if record.name else 'contig_%d' % index

            full_seq = str(record.seq)
            if args.circular:
                full_seq += full_seq[:L - 1]

            # Split the contig into chunksize-site pieces.
            offset = 0
            while offset + window < len(full_seq):
                contig_list.append(
                    (full_seq[offset:offset + window], name, offset))
                offset += chunksize
            # Whatever remains forms the final (possibly shorter) chunk.
            contig_list.append((full_seq[offset:], name, offset))

        if len(contig_list) == 0:
            raise SortSeqError('No input sequences to read.')

    # Compute results.  The output location is resolved before scanning.
    if args.out:
        outloc = io.validate_file_for_writing(args.out)
    else:
        outloc = sys.stdout
    output_df = main(
        model_df, contig_list, numsites=args.numsites, verbose=args.verbose)

    # Write df to stdout or to outfile
    io.write(output_df, outloc, fast=args.fast)