Example #1
import os

import numpy as np
import matplotlib.pyplot as plt

# get_guessed_fastq_prefix, get_genericfastq_iterator and utils come from
# the surrounding project and are assumed to be importable here.

def create_fastq_length_histogram(fastqFile, logger=None):
    SUB = 'create_fastq_length_histogram'

    fastq_prefix = get_guessed_fastq_prefix(fastqFile)
    baseDir = os.path.dirname(fastqFile) or '.'
    plot_name = baseDir + '/' + fastq_prefix + '_read_length.png'

    # default to a log file next to the input fastq if no logger was passed in
    if logger is None:
        logfile = baseDir + '/' + fastq_prefix + '_read_length.log'
        logger = utils._get_logger(logfile, logger_name=SUB)

    # collect the length of every read in the fastq file
    read_lengths = [len(read.seq) for read in get_genericfastq_iterator(fastqFile)]
    np_read_lengths = np.array(read_lengths, dtype=int)

    # plot the histogram
    fig, ax = plt.subplots()
    ax.hist(np_read_lengths, bins=50, color='darkgreen')
    ax.set_ylabel('read counts')
    ax.set_xlabel('read length')
    ax.set_title(fastq_prefix)
    plt.savefig(plot_name)
    plt.close(fig)

    logger.info('created fastq read length histogram for: %s' % fastqFile)
    logger.info('Read length stats: %0.2f +/- %0.2f' % (np.mean(np_read_lengths),
                                                        np.std(np_read_lengths)))
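A minimal usage sketch for the function above; the FASTQ path is hypothetical:

# Hypothetical call: writes <prefix>_read_length.png (and, since no logger is
# passed, <prefix>_read_length.log) next to the input file.
create_fastq_length_histogram('/data/run42/sample1.fastq')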
Example #2
import pandas as pd
import pickle as pkl

# config, params, model_name, DataProcessor, get_model_data,
# SemanticMatchingModel and utils come from the surrounding project.

def train():

    utils._makedirs("../logs")
    utils._makedirs("../output")
    logger = utils._get_logger("../logs", "tf-%s.log" % utils._timestamp())

    dfTrain = pd.read_csv(config.TRAIN_FILE, header=None, sep="\t")
    dfTrain.columns = ["id", "left", "right", "label"]

    dfTrain.dropna(inplace=True)

    # shuffle training data
    dfTrain = dfTrain.sample(frac=1.0)

    dp = DataProcessor(max_num_words=params["max_num_words"],
                       max_num_chars=params["max_num_chars"])
    dfTrain = dp.fit_transform(dfTrain)

    # 60/40 train/validation split on the already-shuffled data
    N = dfTrain.shape[0]
    train_ratio = 0.6
    train_num = int(N * train_ratio)
    X_train = get_model_data(dfTrain[:train_num], params)
    X_valid = get_model_data(dfTrain[train_num:], params)

    model = SemanticMatchingModel(model_name,
                                  params,
                                  logger=logger,
                                  threshold=0.2)
    model.fit(X_train, validation_data=X_valid, shuffle=False)

    # save the model session plus the fitted data processor and threshold
    model.save_session()
    with open("dp.pkl", "wb") as f:
        pkl.dump((dp, model.threshold), f, protocol=2)
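A hedged sketch of the inference side, reloading what train() pickled above; the file name comes from the code, everything else is illustrative:

import pickle as pkl

# Reload the fitted DataProcessor and decision threshold saved by train().
with open("dp.pkl", "rb") as f:
    dp, threshold = pkl.load(f)
# dp could then be used to prepare new data for scoring (assuming the
# project's DataProcessor exposes a transform-style method).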
Example #3
import os
import time

# utils comes from the surrounding project.

def __fix_dirs_N_initialize_logger(args):
    # output directory defaults to the directory of the first read1 file
    if args.outdir is None:
        args.outdir = os.path.abspath(os.path.dirname(args.read1[0]))
    # timestamp the prefix, deriving it from the script name when absent
    if args.prefix is None:
        args.prefix = '%s_%s' % (os.path.basename(__file__).replace('.py', ''),
                                 time.strftime('%Y-%m-%d_%H-%M-%S'))
    else:
        args.prefix = '%s_%s' % (args.prefix, time.strftime('%Y-%m-%d_%H-%M-%S'))

    # get the logger
    log_file_name = args.outdir + '/' + args.prefix + '.log'
    args.logger = utils._get_logger(log_file_name)
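A hypothetical driver for the helper above; the argparse wiring is made up, but the attribute names (read1, outdir, prefix) match what the function reads:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--read1', nargs='+', required=True)
parser.add_argument('--outdir', default=None)
parser.add_argument('--prefix', default=None)
args = parser.parse_args()

__fix_dirs_N_initialize_logger(args)  # attaches args.logger
args.logger.info('pipeline started')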
Example #4
File: BWA.py Project: apratap/appys
import os
import sys

# utils, fastqUtils and creat_bwa_ref_index come from the apratap/appys project.

def __init__(self, reference, logger=None, combined_fastq=None,
             forceRun=False,
             read1=None,
             read2=None,
             ncores=2,
             outPath=os.getcwd()):

    if read1 is None and combined_fastq is None:
        print('[BWA]: Error: No fastq file provided\n')
        sys.exit(2)

    self.reference = os.path.abspath(reference)
    self.forceRun = forceRun
    self.ncores = ncores
    self.combined_fastq = combined_fastq
    self.read1 = read1
    self.read2 = read2
    self.outPath = outPath

    # create a new mapping directory
    mapDir = outPath + '/mapping'
    if not os.path.exists(mapDir):
        os.makedirs(mapDir)
    self.mapDir = mapDir

    # setup the logger
    if logger is None:
        _logfileName = outPath + '/BWA_processing.log'
        logger = utils._get_logger(_logfileName)
    self.logger = logger

    # mapping related: derive the prefix from read1, falling back to the
    # combined fastq (read1 may be None when only combined_fastq was given)
    fastq_for_prefix = self.read1 if self.read1 is not None else self.combined_fastq
    self.fastq_prefix = fastqUtils.get_guessed_fastq_prefix(os.path.basename(fastq_for_prefix))
    self.bwa_map_log = self.mapDir + '/' + self.fastq_prefix + '.bwa.log'
    self.mappedBam = self.mapDir + '/' + self.fastq_prefix + '.bam'

    # create the reference index if needed
    final_refPath = creat_bwa_ref_index(reference, logger, outPath=mapDir)
    self.reference = final_refPath
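A hypothetical instantiation; the class name BWA is assumed from the file name, and all paths are illustrative:

# Construct the mapper; a mapping/ directory and BWA_processing.log are
# created under outPath, per the constructor above.
bwa = BWA('/refs/hg19.fa',
          read1='/data/sample_R1.fastq',
          read2='/data/sample_R2.fastq',
          ncores=4,
          outPath='/data/sample_out')
bwa.logger.info('mapping dir: %s' % bwa.mapDir)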
Example #5
import numpy as np

import utils
from model import LogisticRegression, DNN, RankNet, LambdaRank
from prepare_data import label_file_pat, group_file_pat, feature_file_pat


def load_data(split):
    # load the labels, query/group ids and features for one data split
    labels = np.load(label_file_pat % split)
    qids = np.load(group_file_pat % split)
    features = np.load(feature_file_pat % split)

    X = {"feature": features, "label": labels, "qid": qids}
    return X


utils._makedirs("logs")
logger = utils._get_logger("logs", "tf-%s.log" % utils._timestamp())

params_common = {
    # you might have to tune the batch size to get ranknet and lambdarank working
    # keep in mind the following:
    # 1. the batch size should be large enough to ensure there are samples with
    #    different relevance labels from the same group, especially when you use
    #    "sample" as the "batch_sampling_method"; this ensures the gradients are
    #    nonzero and stable across batches, which is important for pairwise
    #    methods, e.g., ranknet and lambdarank
    # 2. the batch size should not be very large, since the lambda_ij matrix in
    #    ranknet and lambdarank (of size batch_size x batch_size) consumes a lot
    #    of memory
    "batch_size": 128,
    "epoch": 30,
    "feature_dim": 60,
    "batch_sampling_method": "sample",
    "shuffle": True,