def create_fastq_length_histogram(fastqFile, logger=None):
    SUB = 'create_fastq_length_histogram'
    fastq_prefix = get_guessed_fastq_prefix(fastqFile)
    baseDir = os.path.dirname(fastqFile) or '.'
    plot_name = baseDir + '/' + fastq_prefix + '_read_length.png'
    if logger is None:
        logfile = baseDir + '/' + fastq_prefix + '_read_length.log'
        logger = utils._get_logger(logfile, logger_name=SUB)
    read_lengths = [len(read.seq) for read in get_genericfastq_iterator(fastqFile)]
    np_read_lengths = np.array(read_lengths, dtype=int)
    # plot the histogram
    fig, ax = plt.subplots()
    bins = ax.hist(np_read_lengths, bins=50, color='darkgreen')
    ax.set_ylabel('read counts')
    ax.set_xlabel('read_length')
    ax.set_title(fastq_prefix)
    plt.savefig(plot_name)
    logger.info('created fastq read length histogram for : %s' % (fastqFile))
    logger.info('Read Length stats: %0.2f +/- %0.2f' % (np.mean(np_read_lengths), np.std(np_read_lengths)))
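# Minimal usage sketch for the histogram helper above. It assumes the function is
# importable from a module named fastqUtils (the module name is an assumption) and
# that 'sample_R1.fastq' is a placeholder path to an existing fastq file.
import fastqUtils

# Writes sample_R1_read_length.png and sample_R1_read_length.log next to the fastq
# and logs the mean +/- std of the read lengths.
fastqUtils.create_fastq_length_histogram('sample_R1.fastq')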
def train():
    utils._makedirs("../logs")
    utils._makedirs("../output")
    logger = utils._get_logger("../logs", "tf-%s.log" % utils._timestamp())

    dfTrain = pd.read_csv(config.TRAIN_FILE, header=None, sep="\t")
    dfTrain.columns = ["id", "left", "right", "label"]
    dfTrain.dropna(inplace=True)

    # shuffle training data
    dfTrain = dfTrain.sample(frac=1.0)

    dp = DataProcessor(max_num_words=params["max_num_words"],
                       max_num_chars=params["max_num_chars"])
    dfTrain = dp.fit_transform(dfTrain)

    N = dfTrain.shape[0]
    train_ratio = 0.6
    train_num = int(N * train_ratio)
    X_train = get_model_data(dfTrain[:train_num], params)
    X_valid = get_model_data(dfTrain[train_num:], params)

    model = SemanticMatchingModel(model_name, params, logger=logger, threshold=0.2)
    model.fit(X_train, validation_data=X_valid, shuffle=False)

    # save model
    model.save_session()
    with open("dp.pkl", "wb") as f:
        pkl.dump((dp, model.threshold), f, protocol=2)
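# Hedged sketch: restoring the fitted DataProcessor and decision threshold that
# train() pickles above. The file name "dp.pkl" and the tuple order match the
# pkl.dump call in train(); everything else here is an assumption for illustration.
import pickle as pkl

with open("dp.pkl", "rb") as f:
    dp, threshold = pkl.load(f)

# dp can now transform new (left, right) text pairs the same way as at training
# time, and threshold is the decision cut-off saved from the trained model.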
def __fix_dirs_N_initialize_logger(args):
    # outdir
    if args.outdir is None:
        args.outdir = os.path.abspath(os.path.dirname(args.read1[0]))
    if args.prefix is None:
        args.prefix = '%s_%s' % (os.path.basename(__file__).replace('.py', ''),
                                 time.strftime('%Y-%m-%d_%H-%M-%S'))
    else:
        args.prefix = '%s_%s' % (args.prefix, time.strftime('%Y-%m-%d_%H-%M-%S'))
    # get the logger
    log_file_name = args.outdir + '/' + args.prefix + '.log'
    args.logger = utils._get_logger(log_file_name)
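# Hedged sketch of the argparse namespace this helper expects: it reads
# args.read1 (a list of fastq paths), args.outdir and args.prefix, and attaches
# args.logger. The argument names below mirror the attributes used in the function;
# the actual CLI definition in the script may differ.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--read1', nargs='+', required=True)
parser.add_argument('--outdir', default=None)
parser.add_argument('--prefix', default=None)
args = parser.parse_args(['--read1', 'sample_R1.fastq'])

__fix_dirs_N_initialize_logger(args)  # fills in outdir, a timestamped prefix and a logger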
def __init__(self, reference, logger=None, combined_fastq=None, forceRun=False,
             read1=None, read2=None, ncores=2, outPath=os.getcwd()):
    if read1 is None and combined_fastq is None:
        print('[BWA]: Error: No fastq file provided\n')
        sys.exit(2)
    self.reference = os.path.abspath(reference)
    self.forceRun = forceRun
    self.ncores = ncores
    self.combined_fastq = combined_fastq
    self.read1 = read1
    self.read2 = read2
    self.outPath = outPath
    # create a new mapping directory
    mapDir = outPath + '/mapping'
    if not os.path.exists(mapDir):
        os.makedirs(mapDir)
    self.mapDir = mapDir
    # set up the logger
    if logger is None:
        _logfileName = outPath + '/BWA_processing.log'
        logger = utils._get_logger(_logfileName)
    self.logger = logger
    # mapping related
    self.fastq_prefix = fastqUtils.get_guessed_fastq_prefix(os.path.basename(self.read1))
    self.bwa_map_log = self.mapDir + '/' + self.fastq_prefix + '.bwa.log'
    self.mappedBam = self.mapDir + '/' + self.fastq_prefix + '.bam'
    # create the reference index if needed
    final_refPath = creat_bwa_ref_index(reference, logger, outPath=mapDir)
    self.reference = final_refPath
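# Hedged usage sketch for the constructor above. The enclosing class name is not
# shown in this snippet; 'BWA' is inferred from the '[BWA]' error message and the
# BWA_processing.log file name, and all paths are placeholders.
mapper = BWA(reference='ref.fasta',
             read1='sample_R1.fastq',
             read2='sample_R2.fastq',
             ncores=4,
             outPath='/tmp/bwa_run')
# After construction, mapper.mapDir holds the mapping directory and
# mapper.reference points at the indexed reference returned by creat_bwa_ref_index.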
from model import LogisticRegression, DNN, RankNet, LambdaRank
from prepare_data import label_file_pat, group_file_pat, feature_file_pat


def load_data(type):
    labels = np.load(label_file_pat % type)
    qids = np.load(group_file_pat % type)
    features = np.load(feature_file_pat % type)
    X = {"feature": features, "label": labels, "qid": qids}
    return X


utils._makedirs("logs")
logger = utils._get_logger("logs", "tf-%s.log" % utils._timestamp())

params_common = {
    # You might have to tune the batch size to get RankNet and LambdaRank working.
    # Keep the following in mind:
    # 1. The batch size should be large enough that a batch contains samples with different
    #    relevance labels from the same group, especially when "sample" is used as the
    #    "batch_sampling_method". This keeps the gradients nonzero and stable across batches,
    #    which is important for pairwise methods such as RankNet and LambdaRank.
    # 2. The batch size should not be too large, since the lambda_ij matrix in RankNet and
    #    LambdaRank (of size batch_size x batch_size) consumes a lot of memory.
    "batch_size": 128,
    "epoch": 30,
    "feature_dim": 60,
    "batch_sampling_method": "sample",
    "shuffle": True,