def conservation_score(f_chromsizes, d_phastcons, in_gff, out_avcons):
    tmp = random_string(12)
    d_split = out_avcons + 'gff_by_chromosome_' + tmp
    d_cons = out_avcons + 'conservation_' + tmp
    for d in [d_split, d_cons]:
        if not os.path.exists(d):
            os.makedirs(d)
    f_cons = out_avcons + 'conservation.txt'
    f_aver_cons = out_avcons

    ## get chromosomes
    chromosomes = []
    with open(f_chromsizes) as f:
        for l in f:
            c = l.split('\t')[0]
            if ('random' not in c) and ('chrM' not in c) and ('chrUn' not in c):
                chromosomes.append(c[3:])

    ## separate infile by chromosome
    for c in chromosomes:
        f_out = os.path.join(d_split, 'tss_filtered_all_' + c + '.gff')
        with open(f_out, 'w') as out:
            with open(in_gff) as f:
                for line in f:
                    chrom = line.split('\t')[0]
                    if chrom == c:
                        out.write(line)

    ## calculate conservation per chromosome
    _conservation(chromosomes, d_split, d_cons, d_phastcons)

    ## merge chromosomes
    os.system("cat " + d_cons + "/conservation_all_*txt > " + f_cons)

    ## get average conservation
    _average_conservation(f_cons, f_aver_cons)

    ## cleanup
    is_same = []
    for c in chromosomes:
        n_gff = line_count(os.path.join(d_split, 'tss_filtered_all_' + c + '.gff'))
        n_con = line_count(os.path.join(d_cons, 'conservation_all_' + c + '.txt'))
        is_same.append(n_gff == n_con)

    if all_same(is_same):
        os.system('rm -r %s %s %s' % (d_split, d_cons, f_cons))
    else:
        not_equal = [chromosomes[i] for i, v in enumerate(is_same) if not v]
        sys.exit('Error: Total number of positions does not match for chr: '
                 + ' '.join(not_equal))

    ## sort average conservation
    sorted_avcons = out_avcons + '.sorted.tmp'
    cmd = "sort -k1,2 -n " + out_avcons + " > " + sorted_avcons
    os.system(cmd)
    return sorted_avcons
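# The snippets in this collection all rely on a small line_count helper (plus
# all_same and random_string in conservation_score above) whose definitions are
# not included here. The functions below are only a minimal sketch of what such
# utilities typically look like, not the actual implementations used by these
# projects; the optional skip_empty flag is an assumption based on the
# line_count(self.vocab_path, skip_empty=True) call further down.
import random
import string

def line_count(filename, skip_empty=False):
    """Count the lines in a text file; optionally ignore blank lines (sketch only)."""
    with open(filename) as f:
        if skip_empty:
            return sum(1 for line in f if line.strip())
        return sum(1 for _ in f)

def all_same(items):
    """Return True if every element of items is equal (vacuously True when empty)."""
    return all(x == items[0] for x in items)

def random_string(n):
    """Return a random alphanumeric string of length n, e.g. for temp dir suffixes."""
    return ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(n))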
def load_main_file(self):
    """Start asynchronously loading the stored filename.

    If we land in this method we are assured that the stored filename is
    valid. When loading is done, the file_loaded_cb method is called."""
    # Lock all inputs, clear the plot
    self.rnd_doc_btn.config(state="disabled")
    self.ready_lb.config(bg="red")
    self.dp_plt_wg.config(state="disabled")
    utils.Plotting.reset_plot(self.mpl_ax)
    self.canvas.draw()
    self.txt_entry_doc.delete(0, END)
    self.txt_entry_doc.insert(0, self.text_entry_default)
    self.dp_plt_var.set('None')
    self.status_var.set("Loading data file")

    lines = utils.line_count(self.model.filename)
    self.progressbar["maximum"] = ceil(
        lines / utils.CHUNK_SIZE)  # int(X) + 1 doesn't work for whole values
    self.file_info_var.set("{0} entries".format(lines))

    self.model.load_main_file_async(
        {'filename': self.model.filename, 'linecount': lines},
        callback=self.file_loaded_cb,
        pg_val=self.pg_val)
def build_features_matrix(sorted_gff, sorted_cpg, sorted_avcons, sorted_tata, f_out):
    ## check that all input files contain the same number of data lines
    n_g = line_count(sorted_gff)
    n_c = line_count(sorted_cpg)
    n_a = line_count(sorted_avcons)
    n_t = line_count(sorted_tata)
    if not all_same([n_g, n_c, n_a, n_t]):
        sys.exit('Error: line counts of feature files are not all equal: %s,%s,%s,%s'
                 % (n_g, n_c, n_a, n_t))

    ## create matrix
    lcount = 0
    with open(f_out, 'w') as out:
        with open(sorted_gff) as f:
            for l in f:
                lcount += 1
                l = l.strip().split('\t')
                c = l[0]
                region_up = l[3]    # 500bp upstream of start; not used
                region_down = l[4]  # 500bp downstream of start; not used
                count = l[5]
                strand = l[6]
                info = l[8].split(';')
                #dist_score = '?'
                peak_start = get_value_from_keycolonvalue_list('start', info)
                peak_stop = get_value_from_keycolonvalue_list('stop', info)

                CpG_value = linecache.getline(sorted_cpg, lcount).strip().split('\t')[3]
                try:
                    conservation = linecache.getline(sorted_avcons, lcount).strip().split('\t')[2]
                except IndexError:
                    conservation = '0'
                affinity = linecache.getline(sorted_tata, lcount).strip().split('\t')[7]

                features = ';'.join(['cpg:' + CpG_value,
                                     'cons:' + conservation,
                                     'tata:' + affinity])
                new_info = ';'.join(['region_start:' + region_up,
                                     'region_stop:' + region_down])
                line = '\t'.join([c, l[1], l[2], peak_start, peak_stop,
                                  count, strand, features, new_info])
                out.write(line + '\n')
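# build_features_matrix above (and the training-set code near the end of this
# collection) parses GFF attribute strings of the form 'key:value;key:value;...'
# with a helper named get_value_from_keycolonvalue_list, whose definition is not
# included here. The function below is a hypothetical reconstruction based only
# on how it is called: a key plus a list of 'key:value' strings, returning the
# matching value.
def get_value_from_keycolonvalue_list(key, keycolonvalue_list):
    """Return the value for key from entries like ['start:100', 'stop:200'] (sketch)."""
    for item in keycolonvalue_list:
        k, _, v = item.partition(':')
        if k == key:
            return v
    return ''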
def __init__(self, name, work_dir, data_dir, output_root):
    super(Configuration, self).__init__()

    # file related params
    self.work_dir = work_dir
    self.data_dir = data_dir
    self.vocab_path = os.path.join(self.data_dir, 'vocabulary')
    self.embed_path = os.path.join(self.data_dir, 'glove_embedding.npy')
    self.vocab_size = line_count(self.vocab_path, skip_empty=True)

    with setups(self):
        with immutables(self):
            self.start_time = current_datetime()
            self.name = name
            self.output_dir = os.path.join(output_root, name)
            self.train_path = os.path.join(self.data_dir, 'trainyiseg.csv')
            self.train_eval_path = os.path.join(self.data_dir, 'trainyiseg_eval.csv')
            self.valid_path = os.path.join(self.data_dir, 'validyiseg.csv')
            self.test_path = os.path.join(self.data_dir, 'testayiseg.csv')
            self.model_path = os.path.join(self.output_dir, 'model')
            self.elmo_path = os.path.join(self.data_dir, 'elmo', 'model')
            self.num_aspects = 20
            self.visible_gpus = '0'  # visible GPUs
            self.num_gpus = len(self.visible_gpus.split(','))
            os.environ['CUDA_VISIBLE_DEVICES'] = self.visible_gpus

    with structures(self):
        self.hidden_size = 512
        self.embed_size = 300
        self.atn_units = 300
        self.num_layers = 1
        self.use_elmo = False

    with learning(self):
        # training process params
        self.load_embed = True
        self.keep_prob = 0.65
        self.rnn_kernel_keep_prob = 0.8
        self.max_epoch = 50
        self.grad_clip_max_norm = 5.0
        self.early_stop_epoch = 10
        # input params
        self.batch_size = 64
        self.eval_batch_size = 64
def execute_task(self, args):
    """
    Description : Stores sanitized argument variables and dispatches the
                  task to the relevant method
    Parameters  : Parsed CLI arguments
    """
    self.args = args
    if self.model.check_file_validity(args.input_file):
        self.load_main_file({
            'filename': args.input_file,
            'linecount': utils.line_count(str(args.input_file))
        })
        self.doc = self.get_doc(args.doc_uuid, args.task_id)
        self.user = self.get_user(args.user_uuid)
        if args.task_id in ['4d', '5']:
            self.sort = self.get_sort(args.sort)
        task_func = self.dispatch.get(args.task_id)
        task_func()
    else:
        raise utils.InvalidArgumentError("Input file is not valid!")
def load_data(base_name, plevel, ulevel, hlength, sv=False):
    """
    Load and pre-format the Foodmart data (products, customers and user sessions).

    Args:
     * ``base_name`` (*str*): path to the main data folder.
     * ``plevel`` (*int*): level parameter for the product clustering.
     * ``hlength`` (*int*): history length.
     * ``sv`` (*bool, optional*): if True, store the computed information in
       .items, .profiles, .train and .test files.

    Returns:
     * ``product_to_cluster`` (*ndarray*): maps a productID to a clusterID.
       Note 0 -> -1 is the empty selection.
     * ``user_sessions`` (*dict*): maps a customerID to its list of purchased
       product clusterIDs.
     * ``actions`` (*list*): sorted product cluster IDs.
     * ``output_base`` (*str*): output file prefix (set when ``sv`` is True).
    """
    # Init output folder
    if sv:
        output_base = init_output_dir(plevel, ulevel, hlength)

    ###### Load and cluster items ######
    print("\n\033[92m-----> Load and Cluster products\033[0m")
    product_to_cluster = np.zeros(line_count(load_datafile(base_name, "product.csv")) + 1,
                                  dtype=int)   # Product ID -> Cluster ID
    tmp_index = {}                             # Cluster name -> Cluster ID
    tmp_clusters = defaultdict(lambda: [])     # Cluster name -> Product ID list

    # Load product list
    if plevel == 0:
        f = load_datafile(base_name, "product.csv")
        r = csv.reader(f)
        next(r)
        for product in r:
            tmp_clusters[product[3]].append(int(product[1]))
            try:
                product_to_cluster[int(product[1])] = tmp_index[product[3]]
            except KeyError:
                tmp_index[product[3]] = len(tmp_index) + 1
                product_to_cluster[int(product[1])] = tmp_index[product[3]]
        f.close()
    else:
        # Load product categories
        product_classes = {}
        f = load_datafile(base_name, "product_class.csv")
        r = csv.reader(f)
        next(r)
        for categories in r:
            product_classes[int(categories[0])] = categories[plevel]
        f.close()

        # Cluster products
        f = load_datafile(base_name, "product.csv")
        r = csv.reader(f)
        next(r)
        for product in r:
            try:
                product_to_cluster[int(product[1])] = tmp_index[product_classes[int(product[0])]]
            except KeyError:
                tmp_index[product_classes[int(product[0])]] = len(tmp_index) + 1
                product_to_cluster[int(product[1])] = tmp_index[product_classes[int(product[0])]]
            tmp_clusters[product_classes[int(product[0])]].append(int(product[1]))
        f.close()

    # Print summary
    print("   %d product profiles (%d products)" % (len(tmp_index), len(product_to_cluster) - 1))
    print('\n'.join("    > %s: %.2f%%" % (k, 100 * float(len(v)) / (len(product_to_cluster) - 1))
                    for k, v in iteritems(tmp_clusters)))
    actions = sorted(itervalues(tmp_index))
    product_to_cluster[0] = 0  # Empty selection

    # Init states
    print("\n\033[92m-----> [Optional] Export states description\033[0m")
    init_base_writing(len(actions), args.history)
    if sv:
        rv_tmp_indx = {v: k for k, v in tmp_index.items()}
        rv_tmp_indx[0] = str(chr(35))
        with open("%s.states" % output_base, 'w') as f:
            f.write('\n'.join("%f\t%s" % (x, '|'.join(rv_tmp_indx[y] for y in id_to_state(x)))
                              for x in xrange(get_nstates(len(actions), args.history))))

    ###### Load and store user sessions ######
    print("\n\033[92m-----> Load user sessions and shop profits \033[0m")
    user_sessions = defaultdict(lambda: [0] * hlength)

    # Load sessions
    f = load_datafile(base_name, "sales.csv")
    r = csv.reader(f)
    next(r)
    for sale in r:
        product_clusterID = product_to_cluster[int(sale[0])]
        user_sessions[int(sale[2])].append(product_clusterID)
    f.close()

    # Save product clusters information
    if sv:
        with open("%s.items" % output_base, 'w') as f:
            f.write('\n'.join("%d\t%s\t%d" % (tmp_index[k], k, len(tmp_clusters[k]))
                              for k in sorted(tmp_index.keys(), key=lambda x: tmp_index[x])))

    # Return values
    return product_to_cluster, user_sessions, actions, output_base
data_dir_path = Path('data')
file_names = [x.name for x in data_dir_path.glob('*') if x.is_file()]

# prints the size of each dataset
import os
print('---Size of each dataset---\n')
print('Super Bowl History:')
print(os.path.getsize('data/datasets_superbowl.csv'))
print('\nSuper Bowl Ads:')
print(os.path.getsize('data/datasets_superbowl-ads.csv'))

# prints the number of rows in each dataset
from utils import line_count
print("---Number of rows from each dataset---\n")
print("Super Bowl History:")
line_num = line_count('data/datasets_superbowl.csv')
print(line_num)
print("\nSuper Bowl Ads:")
line_num = line_count('data/datasets_superbowl-ads.csv')
print(line_num)

# prints all columns and one row from each dataset
from utils import head
print("All columns and one row from each dataset\n")
print("Super Bowl History:")
print(head('data/datasets_superbowl.csv', 2))
print("\nSuper Bowl Ads:")
print(head('data/datasets_superbowl-ads.csv', 2))

#-------------------------------------------------------#
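# The Super Bowl script above also imports a head helper from the same utils
# module to preview the first rows of a CSV. That helper is not shown in this
# collection; the version below is only a guess at its behaviour (return the
# first n lines of a file), named to match the call sites head(path, n).
def head(filename, n):
    """Return the first n lines of a text file as a single string (sketch)."""
    lines = []
    with open(filename) as f:
        for i, line in enumerate(f):
            if i >= n:
                break
            lines.append(line.rstrip('\n'))
    return '\n'.join(lines)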
import os
import pickle
from ast import literal_eval
from collections import Counter

import jsonlines
import numpy as np
import redis
from tqdm import tqdm

from utils import line_count
import ipdb

rd = redis.StrictRedis()

raw_dict = open('data/resource.txt', 'r').read()
raw_dict = literal_eval(raw_dict)
entity_lst = raw_dict['csk_entities']

if not os.path.isfile('freq_dict.pkl'):
    freq_dict = Counter()
    num_lines = line_count('data/trainset.jsonl')
    with jsonlines.open('data/trainset.jsonl', mode='r') as reader:
        for _, line in zip(tqdm(range(num_lines)), reader):
            freq_dict.update(line['post'])
            freq_dict.update(line['response'])
    with open('freq_dict.pkl', 'wb') as f:
        pickle.dump(freq_dict, f)

with open('freq_dict.pkl', 'rb') as f:
    freq_dict = pickle.load(f)


def is_in_khop(k_exp):
    reply = rd.execute_command(
        'GRAPH.QUERY', 'CCM',
def process_file(root, inp):
    start_i, filename = inp
    n_sample = line_count(filename)

    post = np.zeros((n_sample, self.args.max_sentence_len), dtype=np.int32)
    post_length = np.zeros((n_sample), dtype=np.int32)  # valid length (without pad)
    response = np.zeros((n_sample, self.args.max_sentence_len), dtype=np.int32)
    response_length = np.zeros((n_sample), dtype=np.int32)
    # post_triple = np.zeros((n_sample, self.args.max_sentence_len), dtype=np.int32)
    triple = np.zeros((n_sample, self.args.max_sentence_len, self.args.max_triple_len, 3),
                      dtype=np.int32)
    entity = np.zeros((n_sample, self.args.max_sentence_len, self.args.max_triple_len),
                      dtype=np.int32)
    response_triple = np.zeros((n_sample, self.args.max_sentence_len, 3), dtype=np.int32)

    max_post_len, max_response_len, max_triple_len = 0, 0, 0

    with jsonlines.open(filename) as df:
        for i, line in enumerate(df):
            pl, rl = len(line['post']) + 2, len(line['response']) + 2
            post_length[i] = pl
            response_length[i] = rl
            max_post_len = max(pl, max_post_len)
            max_response_len = max(rl, max_response_len)
            max_triple_len = max([len(l) for l in line['all_triples']] + [max_triple_len])

            all_triples = [
                line['all_triples'][i - 1] if i > 0 else [-1]
                for i in line['post_triples']
            ]

            post[i, :pl] = [SOS_IDX] + [self.get_word_idx(p) for p in line['post']] + [EOS_IDX]
            response[i, :rl] = [SOS_IDX] + [self.get_word_idx(r) for r in line['response']] + [EOS_IDX]
            # post_triple[i, 1:pl-1] = np.array(line['post_triples'])  # [0, 0, 1, 0, 2...]
            response_triple[i, :rl] = [NAF_TRIPLE] + [
                transform_triple_to_hrt(rt) for rt in line['response_triples']
            ] + [NAF_TRIPLE]

            # put NAF_TRIPLE/entity at index 0
            triple[i] = pad_2d(
                [[NAF_TRIPLE]] + [[transform_triple_to_hrt(t) for t in triples]
                                  for triples in all_triples] + [[NAF_TRIPLE]],
                length=(self.args.max_sentence_len, self.args.max_triple_len, 3))
            entity[i] = pad_2d(
                [[NAF_IDX]] + [[self.entidx2wordidx[e] for e in entities]
                               for entities in line['all_entities']] + [[NAF_IDX]],
                length=(self.args.max_sentence_len, self.args.max_triple_len))

    # dump to zarr
    root['post'][start_i:start_i + n_sample] = post
    root['post_length'][start_i:start_i + n_sample] = post_length
    root['response'][start_i:start_i + n_sample] = response
    root['response_length'][start_i:start_i + n_sample] = response_length
    # root['post_triple'][start_i : start_i+n_sample] = post_triple
    root['triple'][start_i:start_i + n_sample] = triple
    root['entity'][start_i:start_i + n_sample] = entity
    root['response_triple'][start_i:start_i + n_sample] = response_triple

    return max_post_len, max_response_len, max_triple_len
def init_data(self, data_name, n_chunk=1024):
    print(f'Initializing {data_name} data...')

    def transform_triple_to_hrt(triple_idx):
        """Transform a triple index (as a whole) to h/r/t format."""
        if triple_idx == -1:  # for response_triple
            return NAF_TRIPLE
        triple = self.idx2triple[triple_idx]
        h, r, t = triple.split(', ')
        return [self.word2idx[h], self.rel2idx[r], self.word2idx[t]]

    def process_file(root, inp):
        start_i, filename = inp
        n_sample = line_count(filename)

        post = np.zeros((n_sample, self.args.max_sentence_len), dtype=np.int32)
        post_length = np.zeros((n_sample), dtype=np.int32)  # valid length (without pad)
        response = np.zeros((n_sample, self.args.max_sentence_len), dtype=np.int32)
        response_length = np.zeros((n_sample), dtype=np.int32)
        # post_triple = np.zeros((n_sample, self.args.max_sentence_len), dtype=np.int32)
        triple = np.zeros((n_sample, self.args.max_sentence_len, self.args.max_triple_len, 3),
                          dtype=np.int32)
        entity = np.zeros((n_sample, self.args.max_sentence_len, self.args.max_triple_len),
                          dtype=np.int32)
        response_triple = np.zeros((n_sample, self.args.max_sentence_len, 3), dtype=np.int32)

        max_post_len, max_response_len, max_triple_len = 0, 0, 0

        with jsonlines.open(filename) as df:
            for i, line in enumerate(df):
                pl, rl = len(line['post']) + 2, len(line['response']) + 2
                post_length[i] = pl
                response_length[i] = rl
                max_post_len = max(pl, max_post_len)
                max_response_len = max(rl, max_response_len)
                max_triple_len = max([len(l) for l in line['all_triples']] + [max_triple_len])

                all_triples = [
                    line['all_triples'][i - 1] if i > 0 else [-1]
                    for i in line['post_triples']
                ]

                post[i, :pl] = [SOS_IDX] + [self.get_word_idx(p) for p in line['post']] + [EOS_IDX]
                response[i, :rl] = [SOS_IDX] + [self.get_word_idx(r) for r in line['response']] + [EOS_IDX]
                # post_triple[i, 1:pl-1] = np.array(line['post_triples'])  # [0, 0, 1, 0, 2...]
                response_triple[i, :rl] = [NAF_TRIPLE] + [
                    transform_triple_to_hrt(rt) for rt in line['response_triples']
                ] + [NAF_TRIPLE]

                # put NAF_TRIPLE/entity at index 0
                triple[i] = pad_2d(
                    [[NAF_TRIPLE]] + [[transform_triple_to_hrt(t) for t in triples]
                                      for triples in all_triples] + [[NAF_TRIPLE]],
                    length=(self.args.max_sentence_len, self.args.max_triple_len, 3))
                entity[i] = pad_2d(
                    [[NAF_IDX]] + [[self.entidx2wordidx[e] for e in entities]
                                   for entities in line['all_entities']] + [[NAF_IDX]],
                    length=(self.args.max_sentence_len, self.args.max_triple_len))

        # dump to zarr
        root['post'][start_i:start_i + n_sample] = post
        root['post_length'][start_i:start_i + n_sample] = post_length
        root['response'][start_i:start_i + n_sample] = response
        root['response_length'][start_i:start_i + n_sample] = response_length
        # root['post_triple'][start_i : start_i+n_sample] = post_triple
        root['triple'][start_i:start_i + n_sample] = triple
        root['entity'][start_i:start_i + n_sample] = entity
        root['response_triple'][start_i:start_i + n_sample] = response_triple

        return max_post_len, max_response_len, max_triple_len

    toread = [
        f'{self.data_path}/{data_name}set_pieces/{piece}'
        for piece in os.listdir(f'{self.data_path}/{data_name}set_pieces')
    ]
    n_lines = sum([line_count(piece) for piece in toread])
    # round up to a multiple of n_chunk to avoid errors when the last piece
    # is smaller than the specified chunk size
    init_n_lines = math.ceil(n_lines / n_chunk) * n_chunk

    root = zarr.open(f'{self.data_path}/{data_name}set_new.zarr', mode='w')
    post = root.zeros('post',
                      shape=(init_n_lines, self.args.max_sentence_len),
                      chunks=(n_chunk, None), dtype='i4')
    post_length = root.zeros('post_length',
                             shape=(init_n_lines,),
                             chunks=(n_chunk,), dtype='i4')  # valid length (without pad)
    response = root.zeros('response',
                          shape=(init_n_lines, self.args.max_sentence_len),
                          chunks=(n_chunk, None), dtype='i4')
    response_length = root.zeros('response_length',
                                 shape=(init_n_lines,),
                                 chunks=(n_chunk,), dtype='i4')
    post_triple = root.zeros('post_triple',
                             shape=(init_n_lines, self.args.max_sentence_len),
                             chunks=(n_chunk, None), dtype='i4')
    triple = root.zeros('triple',
                        shape=(init_n_lines, self.args.max_sentence_len,
                               self.args.max_triple_len, 3),
                        chunks=(n_chunk, None, None, None), dtype='i4')
    entity = root.zeros('entity',
                        shape=(init_n_lines, self.args.max_sentence_len,
                               self.args.max_triple_len),
                        chunks=(n_chunk, None, None), dtype='i4')
    response_triple = root.zeros('response_triple',
                                 shape=(init_n_lines, self.args.max_sentence_len, 3),
                                 chunks=(n_chunk, None, None), dtype='i4')

    pool = Pool(min(len(toread), mp.cpu_count()))
    func = functools.partial(process_file, root)
    iterinp = [(i * self.args.data_piece_size, filename)
               for i, filename in enumerate(toread)]
    max_post_lens, max_response_lens, max_triple_lens = zip(
        *tqdm(pool.imap(func, iterinp), total=len(iterinp)))
    max_post_len, max_response_len, max_triple_len = (max(max_post_lens),
                                                      max(max_response_lens),
                                                      max(max_triple_lens))

    # trim remaining space
    post.resize(n_lines, max_post_len)
    post_length.resize(n_lines)
    response.resize(n_lines, max_response_len)
    response_length.resize(n_lines)
    post_triple.resize(n_lines, max_post_len)
    triple.resize(n_lines, max_post_len, max_triple_len, 3)
    entity.resize(n_lines, max_post_len, max_triple_len)
    response_triple.resize(n_lines, max_response_len, 3)

    print(f'Dumped {data_name} at: {self.data_path}/{data_name}set_new.zarr')
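# process_file above relies on a pad_2d helper to turn ragged lists of
# triples/entities into fixed-shape arrays before writing them to zarr; its
# definition is not part of these snippets. The function below is only a
# hypothetical sketch that matches the call sites (a nested list plus a target
# shape named length), with zero-padding assumed for missing entries.
import numpy as np

def pad_2d(nested, length, pad_value=0):
    """Pad a ragged list of lists into a dense int32 array of shape length (sketch).

    length may be 2-D (sentence_len, triple_len) for entity ids or
    3-D (sentence_len, triple_len, 3) for h/r/t triples.
    """
    out = np.full(length, pad_value, dtype=np.int32)
    for i, row in enumerate(nested[:length[0]]):
        for j, item in enumerate(row[:length[1]]):
            out[i, j] = item  # item is a scalar id or a length-3 [h, r, t] list
    return out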
def main(files, outdir, N, percent_lib, is_get_id, f_config, verbose=False):
    if os.path.isdir(outdir):
        sys.exit('## ERROR: "%s" already exists' % outdir)

    cparser = SafeConfigParser()
    cparser.read(f_config)
    verbose = True

    f_mirbasegff = cparser.get('mirbase', 'gff2')
    f_chromsizes = cparser.get('genome', 'chromsizes')
    f_repeats = cparser.get('genome', 'repeats')
    f_ensembl = cparser.get('genome', 'ensemblgtf')
    f_fasta = cparser.get('genome', 'fasta')
    d_phastcons = cparser.get('cons', 'phastcons')
    TRAP = cparser.get('tata', 'trap')
    f_psemmatrix = cparser.get('tata', 'psem')
    f_traincfg = cparser.get('configs', 'tcconfig')
    m_mirna = cparser.get('correlation', 'srnaseqmatrix')
    m_tss = cparser.get('correlation', 'cageseqmatrix')
    corrmethod = cparser.get('correlation', 'corrmethod')

    f_trainingset = os.path.join(outdir, 'TrainingSet.gff')
    outdir1 = f_trainingset + '_intermediates'
    ensure_dir(outdir, False)
    ensure_dir(outdir1, False)

    _files = glob.glob(files)

    ## creating auxiliary file for negative set
    f_fiveprimegff = '../data/hsa.five_prime.gff'
    if not os.path.exists(f_fiveprimegff):
        if verbose: print 'STATUS: creating "%s" auxiliary file...' % f_fiveprimegff
        extract_tss_from_ensembl(f_ensembl, f_fiveprimegff)

    ## create training set
    gff_ts_pos = os.path.join(outdir1, 'trainingset_pos.gff')
    gff_ts_neg = os.path.join(outdir1, 'trainingset_neg.gff')

    if verbose: print 'STATUS: creating positive candidate set...'
    create_positiveset(percent_lib, _files, f_mirbasegff, N, gff_ts_pos, is_get_id)

    if verbose: print 'STATUS: creating negative candidate set...'
    create_negativeset(f_chromsizes, f_repeats, f_fiveprimegff, f_traincfg, N, gff_ts_neg)
    shutil.move(os.path.join(outdir1, 'tc-norm_negSet'),
                os.path.join(outdir, 'tc-norm_negSet'))

    ## feature extraction: cpg, cons, tata (features.py)
    if verbose: print 'STATUS: extracting features cpg/cons/tata...'
    gff_1kbfeatures_pos = os.path.join(outdir1, 'features1kb_ts_pos.gff')
    gff_1kbfeatures_neg = os.path.join(outdir1, 'features1kb_ts_neg.gff')

    features.main(gff_ts_pos, outdir1, f_fasta, f_chromsizes, d_phastcons,
                  TRAP, f_psemmatrix, gff_1kbfeatures_pos)
    features.main(gff_ts_neg, outdir1, f_fasta, f_chromsizes, d_phastcons,
                  TRAP, f_psemmatrix, gff_1kbfeatures_neg)

    ## feature extraction: mirna_proximity
    if verbose: print 'STATUS: extracting features mirna_proximity...'
    gff_mirnaprox_pos = os.path.join(outdir1, 'featureMprox_ts_pos.gff')
    gff_mirnaprox_neg = os.path.join(outdir1, 'featureMprox_ts_neg.gff')

    mirna_proximity.main(gff_ts_pos, f_mirbasegff, gff_mirnaprox_pos)
    mirna_proximity.main(gff_ts_neg, f_mirbasegff, gff_mirnaprox_neg)

    gff_features_pos = os.path.join(outdir1, 'Features_ts_pos.gff')
    gff_features_neg = os.path.join(outdir1, 'Features_ts_neg.gff')
    gff_unify_features.main(gff_1kbfeatures_pos, gff_mirnaprox_pos, 'mirna_prox', '0',
                            gff_features_pos, True)
    gff_unify_features.main(gff_1kbfeatures_neg, gff_mirnaprox_neg, 'mirna_prox', '0',
                            gff_features_neg, True)

    ## create final training set ...
    ## where background must pass criteria:
    ##   cpg <= 0.5 and cons <= 0.2 and tata <= 0.1 and mirna_prox == 0
    if verbose: print 'STATUS: creating final training set...'
    good_background = gff_features_neg + '_cpglt0.5-conslt0.2-tatalt0.1-mproxeq0.gff'
    with open(good_background, 'w') as out:
        with open(gff_features_neg) as f:
            for line in f:
                info = line.strip().split('\t')[7].split(';')
                cpg = float(get_value_from_keycolonvalue_list('cpg', info))
                cons = float(get_value_from_keycolonvalue_list('cons', info))
                tata = float(get_value_from_keycolonvalue_list('tata', info))
                mprx = float(get_value_from_keycolonvalue_list('mirna_prox', info))
                if cpg <= 0.5 and cons <= 0.2 and tata <= 0.1 and mprx == 0:
                    out.write(line)

    wc = line_count(good_background)
    selectedlines = random.sample(range(1, wc + 1), N)
    with open(f_trainingset, 'w') as out:
        ## writing negative set
        for l in selectedlines:
            out.write(linecache.getline(good_background, l))

        ## writing positive set
        with open(gff_features_pos) as f:
            ## when the mirna_prox feature extraction was used,
            ## all pairs within 50kb upstream of a mirna were extracted
            ## -> a single tss could have many mirna
            ## take the pair with min distance
            ## -> essentially the first entry
            pos_list = []
            for line in f:
                l = line.split('\t')
                pos = ','.join([l[0], l[3], l[4], l[6]])
                if not (pos in pos_list):
                    pos_list.append(pos)
                    out.write(line)

    if not (os.path.isfile(m_mirna) and os.path.isfile(m_tss)):
        return f_trainingset

    ## create final training set with feature: correlation of closest tss->miRNA ...
    if verbose: print 'STATUS: creating final training set with correlation of closest tss->miRNA...'
    f_trainingset2 = os.path.join(outdir, 'TrainingSet-corr.gff')
    m_back = glob.glob('%s/tc-norm_negSet/*tpm_rle.matrix' % outdir)[0]
    f_tcfilesinput = os.path.join(outdir, 'tc-norm_negSet', 'files.txt')
    feature_closest_corr(f_trainingset, f_mirbasegff, m_mirna, m_tss, m_back,
                         f_tcfilesinput, corrmethod, f_trainingset2)

    return f_trainingset2