def main():
    """Predict the chromosome of Hi-C bins from their interaction profiles.

    Command-line entry point. Loads the interaction matrix given with
    ``-in`` (passing the ``-pc`` chromosome list to the loader) and zeroes
    its diagonal, then runs either or both of:

    * ``-cv``: for every leave-out half-window in ``-v``, predict the
      chromosome of each bin with a model fitted without the bins that lie
      on the same chromosome within that window, and write
      ``<out>_cvpred_v<v>.tab``.
    * ``-p``: fit a model on the loaded matrix, reload the file without a
      chromosome list, and predict a chromosome for the bins whose contig
      name is neither in ``-pc`` nor in ``-x``; write
      ``<out>_predictions.tab``.
    """
    parser = argparse.ArgumentParser(
        description='Description',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-in', help='Hi-C interaction matrix',
                        dest='infile', type=str, required=True)
    parser.add_argument('-out', help='prefix for output files',
                        dest='outfile', type=str, required=True)
    parser.add_argument('-cv', help='evaluate by cross validation',
                        dest='cv', action='store_true')
    parser.add_argument('-p', help='predict chromosome of unplace contigs',
                        dest='predict_unplaced', action='store_true')
    parser.add_argument(
        '-v', help='List of leave-out half-window sizes for CV (in bps)',
        dest='v_list', nargs='+', type=float,
        default=[0, 0.5e6, 1e6, 2e6, 5e6, 10e6])
    parser.add_argument('-x', help='excluded chrs', dest='excluded_chrs',
                        nargs='+', type=str, default=['chrM', 'chrY'])
    parser.add_argument(
        '-pc', help='placed chrs', dest='placed_chrs', nargs='+', type=str,
        default=['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7',
                 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14',
                 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20',
                 'chr21', 'chr22', 'chrX'])

    args = parser.parse_args()

    # NOTE: the parser defines no ``eval_on_train`` option, so the former
    # ``args.eval_on_train`` lookup raised AttributeError on every run. The
    # value was never used and the lookup has been dropped.
    infile = args.infile
    outfile = args.outfile
    cv = args.cv
    predict_unplaced = args.predict_unplaced
    v_list = args.v_list
    excluded_chrs = args.excluded_chrs
    placed_chrs = args.placed_chrs

    sys.stderr.write("Loading data\n")
    d, bin_chr, bin_position = triangulation.load_data_txt(
        infile, remove_nans=True, chrs=placed_chrs)
    bin_mean_position = np.mean(bin_position, 1)
    chrs = np.unique(bin_chr)
    n = d.shape[0]
    # Ignore each bin's interaction with itself.
    d[np.diag_indices(n)] = 0

    if cv:
        sys.stderr.write("Evaluating in cross-validation\n")
        d_sum = triangulation.func_reduce(d, bin_chr, func=np.sum).T
        # Tolerance so bins exactly on the window edge count as proximal.
        eps = 1e-8

        for v in v_list:
            sys.stderr.write("leaving out bins within " + str(v) + " bps\n")
            predicted_chr = []
            predicted_prob = []
            for i in np.arange(n):
                # Bins on the same chromosome as bin i whose mean position
                # is within v of bin i's (bin i itself always qualifies).
                proximal_bins = (
                    (bin_chr == bin_chr[i])
                    & (bin_mean_position >= bin_mean_position[i] - v - eps)
                    & (bin_mean_position <= bin_mean_position[i] + v + eps))

                # Per-chromosome sums with the proximal bins' contribution
                # subtracted, then divided by a per-chromosome count.
                train_vectors = d_sum.copy()
                train_vectors -= triangulation.func_reduce(
                    d[proximal_bins, :], bin_chr[proximal_bins],
                    func=np.sum, allkeys=chrs).T
                # NOTE(review): len(~proximal_bins) equals n, while the key
                # array bin_chr[~proximal_bins] is shorter whenever any bin
                # is proximal. Confirm against func_reduce that this length
                # mismatch is intended.
                train_vectors /= triangulation.func_reduce(
                    np.ones(len(~proximal_bins)), bin_chr[~proximal_bins],
                    func=np.sum, allkeys=chrs).T
                train_vectors = train_vectors[~proximal_bins, :]
                train_labels = bin_chr[~proximal_bins]

                model = triangulation.AugmentationChrPredModel()
                model.fit(train_vectors, train_labels)

                # Profile of bin i against the non-proximal bins only.
                test_d = d[i, ~proximal_bins]
                test_bin_chr = bin_chr[~proximal_bins]
                test_vector = triangulation.average_reduce(
                    test_d, test_bin_chr)
                pred_chr, pred_prob = model.predict(test_vector)
                predicted_chr.append(pred_chr[0])
                predicted_prob.append(pred_prob[0])

            predicted_chr = np.array(predicted_chr)
            predicted_prob = np.array(predicted_prob)

            # NOTE(review): the arrays in this list do not share a shape
            # (bin_position is reduced along axis 1 above, so it is 2-D).
            # Confirm the resulting file layout is what is wanted.
            np.savetxt(outfile + '_cvpred_v' + str(v) + '.tab',
                       [bin_chr, bin_position, predicted_chr, predicted_prob],
                       fmt='%s', delimiter='\t')

    if predict_unplaced:
        sys.stderr.write("predicting chromosome of unplaced contigs\n")

        # train on all data (without diagonal)
        model = triangulation.AugmentationChrPredModel()
        d_avg = triangulation.average_reduce(d, bin_chr).T
        model.fit(d_avg, bin_chr)

        # Reload without a chromosome list so the other contigs are present.
        d, bin_chr, bin_position = triangulation.load_data_txt(
            infile, remove_nans=True)
        chrs = np.unique(bin_chr)

        # np.unique() applied directly to a set wraps the whole set as one
        # object element, so the equality mask below never matched. Build a
        # proper 1-D array of names instead.
        unplaced_names = sorted(
            set(bin_chr) - set(placed_chrs) - set(excluded_chrs))
        unplaced_chrs = np.array(unplaced_names, dtype=bin_chr.dtype)
        unplaced_chr_bins = np.any(bin_chr[None].T == unplaced_chrs, 1)

        d = d[unplaced_chr_bins, :]
        d_avg = triangulation.average_reduce(d.T, bin_chr).T
        # Keep only the columns that correspond to placed chromosomes.
        d_avg = d_avg[:, np.any(chrs[None].T == np.array(placed_chrs), 1)]
        pred_pos, pred_prob = model.predict(d_avg)
        res = np.c_[bin_chr[unplaced_chr_bins],
                    bin_position[unplaced_chr_bins, :].astype(int),
                    pred_pos, pred_prob]

        np.savetxt(outfile + '_predictions.tab', res,
                   fmt='%s', delimiter='\t')
def main():
    """Command-line entry point: chromosome prediction for Hi-C bins.

    The interaction matrix named by ``-in`` is loaded with the ``-pc``
    chromosome list handed to the loader, and its diagonal is set to zero.
    Depending on the flags given, the function then:

    * with ``-cv``, loops over the leave-out half-windows in ``-v``; for
      each bin it fits a model that excludes the same-chromosome bins inside
      the window, predicts the bin's chromosome, and saves the results to
      ``<out>_cvpred_v<v>.tab``;
    * with ``-p``, fits a model on the loaded matrix, reloads the file
      without a chromosome list, predicts a chromosome for every bin whose
      contig name appears in neither ``-pc`` nor ``-x``, and saves the
      results to ``<out>_predictions.tab``.
    """
    parser = argparse.ArgumentParser(
        description='Description',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-in',
                        help='Hi-C interaction matrix',
                        dest='infile',
                        type=str,
                        required=True)
    parser.add_argument('-out',
                        help='prefix for output files',
                        dest='outfile',
                        type=str,
                        required=True)
    parser.add_argument('-cv',
                        help='evaluate by cross validation',
                        dest='cv',
                        action='store_true')
    parser.add_argument('-p',
                        help='predict chromosome of unplace contigs',
                        dest='predict_unplaced',
                        action='store_true')
    parser.add_argument(
        '-v',
        help='List of leave-out half-window sizes for CV (in bps)',
        dest='v_list',
        nargs='+',
        type=float,
        default=[0, 0.5e6, 1e6, 2e6, 5e6, 10e6])
    parser.add_argument('-x',
                        help='excluded chrs',
                        dest='excluded_chrs',
                        nargs='+',
                        type=str,
                        default=['chrM', 'chrY'])
    parser.add_argument('-pc',
                        help='placed chrs',
                        dest='placed_chrs',
                        nargs='+',
                        type=str,
                        default=[
                            'chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6',
                            'chr7', 'chr8', 'chr9', 'chr10', 'chr11',
                            'chr12', 'chr13', 'chr14', 'chr15', 'chr16',
                            'chr17', 'chr18', 'chr19', 'chr20', 'chr21',
                            'chr22', 'chrX'
                        ])

    args = parser.parse_args()

    infile = args.infile
    outfile = args.outfile
    cv = args.cv
    predict_unplaced = args.predict_unplaced
    # ``args.eval_on_train`` used to be read here, but no such option is
    # registered on the parser, so the lookup raised AttributeError before
    # any work was done. The value was unused, so the lookup was removed.
    v_list = args.v_list
    excluded_chrs = args.excluded_chrs
    placed_chrs = args.placed_chrs

    sys.stderr.write("Loading data\n")
    d, bin_chr, bin_position = triangulation.load_data_txt(infile,
                                                           remove_nans=True,
                                                           chrs=placed_chrs)
    bin_mean_position = np.mean(bin_position, 1)
    chrs = np.unique(bin_chr)
    n = d.shape[0]
    # Self-interactions are not used: zero the diagonal in place.
    d[np.diag_indices(n)] = 0

    if cv:
        sys.stderr.write("Evaluating in cross-validation\n")
        d_sum = triangulation.func_reduce(d, bin_chr, func=np.sum).T
        # Small slack so positions exactly v away are still left out.
        eps = 1e-8

        for v in v_list:
            sys.stderr.write("leaving out bins within " + str(v) + " bps\n")
            predicted_chr = []
            predicted_prob = []

            for i in np.arange(n):
                # Mask of bins sharing bin i's chromosome with a mean
                # position no further than v from bin i's mean position.
                proximal_bins = (bin_chr == bin_chr[i]) & (
                    bin_mean_position >= bin_mean_position[i] - v - eps) & (
                        bin_mean_position <= bin_mean_position[i] + v + eps)

                # Start from the all-bin sums, remove what the proximal bins
                # contributed, then divide by a per-chromosome count.
                train_vectors = d_sum.copy()
                train_vectors -= triangulation.func_reduce(
                    d[proximal_bins, :],
                    bin_chr[proximal_bins],
                    func=np.sum,
                    allkeys=chrs).T
                # NOTE(review): np.ones(len(~proximal_bins)) has n entries
                # but is keyed by bin_chr[~proximal_bins], which has fewer
                # entries when any bin is proximal. Verify against
                # func_reduce whether a count of non-proximal bins was meant.
                train_vectors /= triangulation.func_reduce(
                    np.ones(len(~proximal_bins)),
                    bin_chr[~proximal_bins],
                    func=np.sum,
                    allkeys=chrs).T
                train_vectors = train_vectors[~proximal_bins, :]
                train_labels = bin_chr[~proximal_bins]

                model = triangulation.AugmentationChrPredModel()
                model.fit(train_vectors, train_labels)

                # Bin i's interactions restricted to the retained bins.
                test_d = d[i, ~proximal_bins]
                test_bin_chr = bin_chr[~proximal_bins]
                test_vector = triangulation.average_reduce(
                    test_d, test_bin_chr)
                pred_chr, pred_prob = model.predict(test_vector)
                predicted_chr.append(pred_chr[0])
                predicted_prob.append(pred_prob[0])

            predicted_chr = np.array(predicted_chr)
            predicted_prob = np.array(predicted_prob)

            # NOTE(review): the list below mixes arrays of different shapes
            # (bin_position is 2-D, as the axis-1 mean above shows). Check
            # that np.savetxt produces the intended table from it.
            np.savetxt(
                outfile + '_cvpred_v' + str(v) + '.tab',
                [bin_chr, bin_position, predicted_chr, predicted_prob],
                fmt='%s',
                delimiter='\t')

    if predict_unplaced:
        sys.stderr.write("predicting chromosome of unplaced contigs\n")

        # train on all data (without diagonal)
        model = triangulation.AugmentationChrPredModel()
        d_avg = triangulation.average_reduce(d, bin_chr).T
        model.fit(d_avg, bin_chr)

        # Second load, this time without passing a chromosome list.
        d, bin_chr, bin_position = triangulation.load_data_txt(
            infile, remove_nans=True)
        chrs = np.unique(bin_chr)

        # Passing a set straight to np.unique() yields a one-element object
        # array holding the set itself, which made the comparison below
        # all-False. Convert the set to a sorted 1-D array of names first.
        unplaced_names = sorted(
            set(bin_chr) - set(placed_chrs) - set(excluded_chrs))
        unplaced_chrs = np.array(unplaced_names, dtype=bin_chr.dtype)
        unplaced_chr_bins = np.any(bin_chr[None].T == unplaced_chrs, 1)

        d = d[unplaced_chr_bins, :]
        d_avg = triangulation.average_reduce(d.T, bin_chr).T
        # Select the columns whose chromosome name is a placed chromosome.
        d_avg = d_avg[:, np.any(chrs[None].T == np.array(placed_chrs), 1)]
        pred_pos, pred_prob = model.predict(d_avg)
        res = np.c_[bin_chr[unplaced_chr_bins],
                    bin_position[unplaced_chr_bins, :].astype(int),
                    pred_pos,
                    pred_prob]

        np.savetxt(outfile + '_predictions.tab',
                   res,
                   fmt='%s',
                   delimiter='\t')
def main():
    """Scaffold a chromosome de novo from a contig interaction matrix.

    Command-line entry point. Steps, in order:

    1. Load the interaction matrix named by ``-in``.
    2. If ``-realpos`` is given, read ``contig<TAB>start<TAB>end`` lines and
       compute a real position per bin (contig start plus the bin's mean
       position); unless ``-keep_unreal`` is set, drop bins whose contig has
       no real position.
    3. Average bins that share a contig id.
    4. With ``-best``, sort contigs by real position and disable shuffling.
    5. Run ``tr.assemble_chromosome`` and write ``<out>_predpos.tab``,
       ``<out>_pos_all.tab``, ``<out>_x0_all.tab``, ``<out>_fvals_all.tab``
       and ``<out>_scales_all.tab``; when real positions are available the
       predicted positions are also printed and plotted to
       ``<out>_predpos.png``.

    Exits via ``sys.exit`` with a message when ``-best`` is used without
    ``-realpos`` or when some contig lacks a real position.
    """
    parser = argparse.ArgumentParser(
        description=
        'Scaffold chromosome de novo from contig interaction matrix.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-in',
                        help='interaction frequency matrix file',
                        dest='in_file',
                        type=str,
                        required=True)
    parser.add_argument('-out',
                        help='out file prefix',
                        dest='out_file',
                        type=str,
                        required=True)
    parser.add_argument('-it',
                        help='number of times to rerun L-BFGS',
                        dest='iterations',
                        type=int,
                        default=1)
    parser.add_argument('-p',
                        help='number of processors to use',
                        dest='pnum',
                        type=int,
                        default=0)
    parser.add_argument('-seed',
                        help='seed for L-BFGS init',
                        dest='init_seed',
                        type=int,
                        default=0)
    parser.add_argument('-shuffle_seed',
                        help='seed for shuffle',
                        dest='shuffle_seed',
                        type=int,
                        default=0)
    parser.add_argument(
        '-realpos',
        help=
        'file with actual contig positions (sorted same as interaction matrix). "contig\tstart\tend"',
        dest='realposfile',
        type=str,
        default=None)
    parser.add_argument(
        '-best',
        help='sort by original positions to estimate best solution',
        dest='sort_by_realpos',
        action='store_true')
    parser.add_argument(
        '-drop',
        help=
        'leaves every nth bin in the data, ignoring the rest. 1 will use the whole dataset',
        dest='drop',
        type=int,
        default=1)
    parser.add_argument(
        '-keep_unreal',
        help='keep contigs for which real position is not known',
        dest='keep_unreal',
        action='store_true')
    parser.add_argument('-lbfgs_pgtol',
                        help='pgtol for lbfgs',
                        dest='lbfgs_pgtol',
                        type=float,
                        default=1e-9)
    parser.add_argument('-lbfgs_factr',
                        help='factr for lbfgs',
                        dest='lbfgs_factr',
                        type=float,
                        default=1e4)
    parser.add_argument('-lbfgs_show',
                        help='show lbfgs iterations (only with pnum = 1)',
                        dest='lbfgs_show',
                        action='store_true')

    args = parser.parse_args()

    in_file = args.in_file
    out_file = args.out_file
    pnum = args.pnum
    iterations = args.iterations
    init_seed = args.init_seed
    shuffle_seed = args.shuffle_seed
    sort_by_realpos = args.sort_by_realpos
    drop = args.drop
    lbfgs_pgtol = args.lbfgs_pgtol
    lbfgs_factr = args.lbfgs_factr
    lbfgs_show = args.lbfgs_show
    realposfile = args.realposfile
    keep_unreal = args.keep_unreal

    logger("loading interactions from %s ..." % in_file)

    # Empty list handed to the loader as ``chrs``; presumably this means no
    # restriction by contig name (e.g. 'ENA|CP002684|CP002684.1' could be
    # listed here) -- confirm in tr.load_data_txt.
    chrs = []
    d, bin_chr, bin_position = tr.load_data_txt(in_file,
                                                retain=drop,
                                                remove_nans=True,
                                                rename=True,
                                                chrs=chrs)

    logger(" loaded matrix with %s contigs." % d.shape[0])

    if realposfile is not None:
        logger("loading real positions from %s ..." % realposfile)
        contig_pos_dict = {}
        with open(realposfile, "r") as fh:
            for line in fh:
                c_name, c_start, c_end = line.rstrip("\n").split("\t")
                contig_pos_dict[c_name] = (float(c_start), float(c_end))

        # Contigs missing from the file get NaN so they can be filtered.
        realpos = np.array(
            [contig_pos_dict.get(i, (np.nan, np.nan)) for i in bin_chr])
        # Real position of a bin = contig start + mean position of the bin.
        realpos = realpos[:, 0] + np.mean(bin_position, 1)

        if not keep_unreal:
            logger("removing contigs without real positions...")
            relevant = ~np.isnan(realpos)
            realpos = realpos[relevant]
            d = d[relevant, :][:, relevant]
            bin_chr = bin_chr[relevant]
            logger(" %s contigs left." % d.shape[0])

    # average contigs that share the same id
    logger("averaging contigs that share the same id...")
    d = tr.average_reduce_2d(d, bin_chr)
    if realposfile is not None:
        realpos = tr.average_reduce(realpos, bin_chr)
    bin_chr = np.unique(bin_chr)

    logger(" %s contigs left." % d.shape[0])

    shuffle = True
    if sort_by_realpos:
        if realposfile is None:
            sys.exit('-best requires -realpos')
        if np.any(np.isnan(realpos)):
            sys.exit(
                '-best requires real positions to be given for ALL contigs')
        # Reorder everything by real position; shuffling is turned off so
        # this ordering is what gets passed on.
        rr = np.argsort(realpos)
        realpos = realpos[rr]
        d = d[rr, :][:, rr]
        bin_chr = bin_chr[rr]
        shuffle = False

    logger("scaffolding %s contigs ..." % d.shape[0])
    logger(" running %s optimisations in %s threads ..." %
           (iterations, pnum))
    scales, pos, x0, fvals = tr.assemble_chromosome(d,
                                                    pnum,
                                                    iterations,
                                                    shuffle,
                                                    shuffle_seed,
                                                    init_seed,
                                                    return_all=True,
                                                    log_data=True,
                                                    lbfgs_factr=lbfgs_factr,
                                                    lbfgs_pgtol=lbfgs_pgtol,
                                                    approx_grad=False,
                                                    lbfgs_show=lbfgs_show)

    logger("saving results ...")

    if realposfile is not None:
        # Parenthesised form prints the same on Python 2 and also parses on
        # Python 3.
        print(pos)
        np.savetxt(out_file + '_predpos.tab',
                   np.rec.fromarrays([bin_chr, realpos, pos[0, :]]),
                   fmt='%s',
                   delimiter='\t')
        # plot
        plt.plot(realpos, pos[0, :], 'b.')
        plt.xlabel("Expected position")
        plt.ylabel("Predicted position")
        plt.savefig(out_file + '_predpos.png')
    else:
        np.savetxt(out_file + '_predpos.tab',
                   np.rec.fromarrays([bin_chr, pos[0, :]]),
                   fmt='%s',
                   delimiter='\t')

    np.savetxt(out_file + '_pos_all.tab', pos, fmt='%s', delimiter='\t')
    np.savetxt(out_file + '_x0_all.tab', x0, fmt='%s', delimiter='\t')
    np.savetxt(out_file + '_fvals_all.tab', fvals, fmt='%s', delimiter='\t')
    np.savetxt(out_file + '_scales_all.tab',
               scales,
               fmt='%s',
               delimiter='\t')

    logger(" done.")
def main():
    """Scaffold a chromosome de novo from a contig interaction matrix.

    Command-line entry point. It loads the matrix named by ``-in``,
    optionally attaches real contig positions from the ``-realpos`` file
    (``contig<TAB>start<TAB>end`` per line) and drops contigs without one
    unless ``-keep_unreal`` is set, averages bins that share a contig id,
    optionally sorts by real position (``-best``, which also disables
    shuffling), runs ``triangulation.assemble_chromosome`` and writes
    ``<out>_predpos.tab``, ``<out>_pos_all.tab``, ``<out>_x0_all.tab``,
    ``<out>_fvals_all.tab`` and ``<out>_scales_all.tab``.

    Progress messages go to stderr. The function exits via ``sys.exit``
    with a message when ``-best`` is used without ``-realpos`` or when some
    contig lacks a real position.
    """
    parser = argparse.ArgumentParser(
        description='Scaffold chromosome de novo from contig interaction matrix.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-in', help='interaction frequency matrix file',
                        dest='in_file', type=str, required=True)
    parser.add_argument('-out', help='out file prefix',
                        dest='out_file', type=str, required=True)
    parser.add_argument('-it', help='number of times to rerun L-BFGS',
                        dest='iterations', type=int, default=1)
    parser.add_argument('-p', help='number of processors to use',
                        dest='pnum', type=int, default=0)
    parser.add_argument('-seed', help='seed for L-BFGS init',
                        dest='init_seed', type=int, default=0)
    parser.add_argument('-shuffle_seed', help='seed for shuffle',
                        dest='shuffle_seed', type=int, default=0)
    parser.add_argument(
        '-realpos',
        help='file with actual contig positions (sorted same as interaction matrix). "contig\tstart\tend"',
        dest='realposfile', type=str, default=None)
    parser.add_argument(
        '-best', help='sort by original positions to estimate best solution',
        dest='sort_by_realpos', action='store_true')
    parser.add_argument(
        '-drop',
        help='leaves every nth bin in the data, ignoring the rest. 1 will use the whole dataset',
        dest='drop', type=int, default=1)
    parser.add_argument(
        '-keep_unreal',
        help='keep contigs for which real position is not known',
        dest='keep_unreal', action='store_true')
    parser.add_argument('-lbfgs_pgtol', help='pgtol for lbfgs',
                        dest='lbfgs_pgtol', type=float, default=1e-9)
    parser.add_argument('-lbfgs_factr', help='factr for lbfgs',
                        dest='lbfgs_factr', type=float, default=1e4)
    parser.add_argument('-lbfgs_show',
                        help='show lbfgs iterations (only with pnum=1)',
                        dest='lbfgs_show', action='store_true')

    args = parser.parse_args()

    in_file = args.in_file
    out_file = args.out_file
    pnum = args.pnum
    iterations = args.iterations
    init_seed = args.init_seed
    shuffle_seed = args.shuffle_seed
    sort_by_realpos = args.sort_by_realpos
    drop = args.drop
    lbfgs_pgtol = args.lbfgs_pgtol
    lbfgs_factr = args.lbfgs_factr
    lbfgs_show = args.lbfgs_show
    realposfile = args.realposfile
    keep_unreal = args.keep_unreal

    sys.stderr.write("loading interactions from " + in_file + " ...\n")
    d, bin_chr, bin_position = triangulation.load_data_txt(
        in_file, retain=drop, remove_nans=True)
    sys.stderr.write("loaded matrix with " + str(d.shape[0]) + " contigs.\n")

    # Identity checks replace the former ``!= None`` / ``== None`` tests.
    have_realpos = realposfile is not None

    if have_realpos:
        sys.stderr.write(
            "loading real positions from " + realposfile + " ...\n")
        contig_pos_dict = {}
        with open(realposfile, "r") as fh:
            for line in fh:
                c_name, c_start, c_end = line.rstrip("\n").split("\t")
                contig_pos_dict[c_name] = (float(c_start), float(c_end))

        # Contigs absent from the file map to NaN so they can be filtered.
        realpos = np.array(
            [contig_pos_dict.get(i, (np.nan, np.nan)) for i in bin_chr])
        # Real position of a bin = contig start + mean position of the bin.
        realpos = realpos[:, 0] + np.mean(bin_position, 1)

        if not keep_unreal:
            sys.stderr.write("removing contigs without real positions...\n")
            relevant = ~np.isnan(realpos)
            realpos = realpos[relevant]
            d = d[relevant, :][:, relevant]
            bin_chr = bin_chr[relevant]
            sys.stderr.write(str(d.shape[0]) + " contigs left.\n")

    # average contigs that share the same id
    sys.stderr.write("averaging contigs that share the same id...\n")
    d = triangulation.average_reduce_2d(d, bin_chr)
    if have_realpos:
        realpos = triangulation.average_reduce(realpos, bin_chr)
    bin_chr = np.unique(bin_chr)
    sys.stderr.write(str(d.shape[0]) + " contigs left.\n")

    shuffle = True
    if sort_by_realpos:
        if not have_realpos:
            sys.exit('-best requires -realpos')
        if np.any(np.isnan(realpos)):
            sys.exit(
                '-best requires real positions to be given for ALL contigs')
        # Reorder matrix, ids and positions by real position; shuffling is
        # switched off so this ordering is what gets passed on.
        rr = np.argsort(realpos)
        realpos = realpos[rr]
        d = d[rr, :][:, rr]
        bin_chr = bin_chr[rr]
        shuffle = False

    sys.stderr.write("scaffolding " + str(d.shape[0]) + " contigs ...\n")
    scales, pos, x0, fvals = triangulation.assemble_chromosome(
        d,
        pnum=pnum,
        iterations=iterations,
        shuffle=shuffle,
        return_all=True,
        shuffle_seed=shuffle_seed,
        init_seed=init_seed,
        log_data=True,
        lbfgs_factr=lbfgs_factr,
        lbfgs_pgtol=lbfgs_pgtol,
        approx_grad=False,
        lbfgs_show=lbfgs_show)

    sys.stderr.write("saving results ...\n")

    # The first row of ``pos`` is written next to the contig ids (and the
    # real positions when known); the full arrays are saved separately.
    if have_realpos:
        np.savetxt(out_file + '_predpos.tab',
                   np.rec.fromarrays([bin_chr, realpos, pos[0, :]]),
                   fmt='%s', delimiter='\t')
    else:
        np.savetxt(out_file + '_predpos.tab',
                   np.rec.fromarrays([bin_chr, pos[0, :]]),
                   fmt='%s', delimiter='\t')

    np.savetxt(out_file + '_pos_all.tab', pos, fmt='%s', delimiter='\t')
    np.savetxt(out_file + '_x0_all.tab', x0, fmt='%s', delimiter='\t')
    np.savetxt(out_file + '_fvals_all.tab', fvals, fmt='%s', delimiter='\t')
    np.savetxt(out_file + '_scales_all.tab', scales, fmt='%s',
               delimiter='\t')

    sys.stderr.write("done.\n")