def writeEvalOutput(self):
    self.outputfh.write("rows are eval genotypes columns comparison genotypes\n")
    self.outputfh.write("\t".join(['', 'AA', 'AB', 'BB', './.']) + "\n")
    rownames = [0, 'AA', 1, 'AB', 2, 'BB', 3, './.']
    for (i, gt) in grouper(2, rownames):
        row = self.concordancetable[i, :].tolist()
        for r in row:
            outstr = "\t".join(map(str, r))
            self.outputfh.write(gt + "\t" + outstr + "\n")
    self.outputfh.write("matrix sum: \n")
    summy = np.sum(self.concordancetable)
    self.outputfh.write(str(summy) + "\n")

    # now we figure out how many sites were called or not called
    self.calledtable[0, 0] = self.concordancetable[0:3, 0:3].sum()
    self.calledtable[0, 1] = self.concordancetable[0:3, 3].sum()
    self.calledtable[1, 0] = self.concordancetable[3, 0:3].sum()
    self.calledtable[1, 1] = self.concordancetable[3, 3]

    self.outputfh.write("\n")
    rownames = [0, 'called', 1, './.']
    self.outputfh.write("rows are eval genotypes columns comparison genotypes\n")
    self.outputfh.write("\t".join(['', 'called', './.']) + "\n")
    for (i, gt) in grouper(2, rownames):
        row = self.calledtable[i, :].tolist()
        for r in row:
            outstr = "\t".join(map(str, r))
            self.outputfh.write(gt + "\t" + outstr + "\n")
    self.outputfh.write("matrix sum: \n")
    summy = np.sum(self.calledtable)
    self.outputfh.write(str(summy) + "\n")
    self.outputfh.write("\n")

    # NRD: discordant non-missing genotype pairs over all non-missing, non-(AA,AA) pairs
    discordance = (self.concordancetable[0, 1] + self.concordancetable[0, 2] +
                   self.concordancetable[1, 0] + self.concordancetable[1, 2] +
                   self.concordancetable[2, 0] + self.concordancetable[2, 1])
    total = (self.concordancetable[0, 1] + self.concordancetable[0, 2] +
             self.concordancetable[1, 0] + self.concordancetable[1, 1] +
             self.concordancetable[1, 2] + self.concordancetable[2, 0] +
             self.concordancetable[2, 1] + self.concordancetable[2, 2])
    nrd = round((float(discordance) / float(total)) * 100, 2)

    # NRS: variant genotypes recovered in the evaluation set over variant genotypes in the comparison set
    variant_count_evaluation = (self.concordancetable[1, 1] + self.concordancetable[1, 2] +
                                self.concordancetable[2, 1] + self.concordancetable[2, 2])
    variant_count_comparison = (self.concordancetable[0, 1] + self.concordancetable[0, 2] +
                                self.concordancetable[1, 1] + self.concordancetable[1, 2] +
                                self.concordancetable[2, 1] + self.concordancetable[2, 2] +
                                self.concordancetable[3, 1] + self.concordancetable[3, 2])
    nrs = round(float(variant_count_evaluation) / float(variant_count_comparison) * 100, 2)

    self.outputfh.write("NRD: " + str(nrd) + " \n")
    self.outputfh.write("NRS: " + str(nrs) + " \n")

    outstring = ",".join(map(str, melt_lol(self.concordancetable.tolist())))
    self.genotypematrixfh.write(outstring + "\n")

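# The grouper() helper used throughout these snippets is not defined in this section.
# A minimal sketch, assuming the standard itertools recipe with the (n, iterable)
# argument order used by the VCF code; note that the PyTorch and crypto snippets below
# call a variant that takes the iterable first, e.g. grouper(iterable, n, fillvalue).
from itertools import izip_longest  # itertools.zip_longest on Python 3


def grouper(n, iterable, fillvalue=None):
    """grouper(2, 'ABCDE') --> ('A', 'B'), ('C', 'D'), ('E', None)"""
    args = [iter(iterable)] * n
    return izip_longest(fillvalue=fillvalue, *args)
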
def main(): usage = "usage: %prog [options] file.vcf \n output format values from genotype data field in a merged VCF generated by CombineVariants from GATK for suitabale plotting/dataviz" parser = OptionParser(usage) parser.add_option("--includeRef", action="store_true", dest="includeRef", help="include sites in the set ReferenceInAll", default=False) parser.add_option("--includeFilter", action="store_true", dest="includeFilter", help="include site filtered or not!", default=False) parser.add_option("--formatTag", dest="format", default="GT", help="format tag to compare (default GT)") (options, args)=parser.parse_args() vcfilename=args[0] #vcfilename='/Users/indapa/software/Pgmsnp/PythonNotebook/child5x.nrs.sites.calledWith20x_bam.child5x.nrs.sites.calledWith5x_bam.combineVariants.vcf' basename=os.path.splitext(vcfilename)[0] vcfobj=VcfFile(vcfilename) vcfh=open(vcfilename,'r') vcfobj.parseMetaAndHeaderLines(vcfh) header=vcfobj.returnHeader() +"\n" samples=vcfobj.getSampleList() print "\t".join(samples) for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh): vrec_ziptuple=vrec.zipGenotypes(samples) outputs=[] for (compare, eval) in grouper(2,vrec_ziptuple): compareGenobj= compare[1] evalGenobj= eval[1] outputs.append( "\t".join( [compareGenobj.getFormatVal(options.format), evalGenobj.getFormatVal(options.format) ] ) ) print "\t".join(outputs)
def eval_batch(data_all, logit_all, in_train=False):
    out_list = []
    for batch, logit in zip(grouper(data_all, bs), grouper(logit_all, bs)):
        batch = [
            b if isinstance(b, torch.Tensor) else torch.from_numpy(b)
            for b in batch if b is not None
        ]
        logit = [
            b if isinstance(b, torch.Tensor) else torch.from_numpy(b)
            for b in logit if b is not None
        ]
        out_batch = net(
            torch.stack(batch, dim=0).cuda(),
            torch.stack(logit, dim=0).cuda(),
            in_train)
        out_list.append(out_batch)
    out = torch.cat(out_list, dim=0)
    return out

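# Hedged usage sketch: eval_batch relies on module-level `net` (the model) and `bs`
# (the batch size), and on a grouper() that pads the last chunk with None, which is
# why the `is not None` filters above are needed. The input arrays are illustrative.
with torch.no_grad():
    valid_out = eval_batch(valid_merged, valid_logit, in_train=False)
    print(valid_out.shape)  # -> (len(valid_merged), num_classes), still on the GPU
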
def aes_cbc_encrypt(message, key, iv):
    ciphertext = ''
    key_size = len(key)
    for block in common.grouper(message, key_size, fillvalue='\x00'):
        block = ''.join(block)  # Convert grouper tuple into a byte string
        ciphertext_cbc = do_aes_cbc_encrypt_chain(block, key, iv)
        iv = ciphertext_cbc  # Each ciphertext block becomes the IV for the next block
        ciphertext += ciphertext_cbc
    return ciphertext

def __str__(self):
    rownames = [0, 'AA', 1, 'AB', 2, 'BB', 3, './.']
    outstring = "\t".join(['', 'AA', 'AB', 'BB', './.']) + "\n"
    for (i, gt) in grouper(2, rownames):
        row = self.concordancetable[i, :].tolist()
        for r in row:
            outstr = "\t".join(map(str, r))
            outstring += (gt + "\t" + outstr + "\n")
    outstring += "eval: " + self.evalname
    outstring += " compare: " + self.comparename + "\n"
    return outstring

def is_ecb_mode(ciphertext, key_size=16):
    #print ('Key size: {}'.format(key_size))
    # Break the ciphertext into key-sized blocks, one block per list element
    cipher_blocks = [''.join(cipher_block)
                     for cipher_block in common.grouper(ciphertext, key_size)]
    # Count each distinct cipher block; any block seen more than once is a match
    matched_blocks = [{block: count}
                      for block, count in Counter(cipher_blocks).items() if count > 1]
    # Total up all matched cipher blocks
    total_blocks_match = sum(count for block in matched_blocks for count in block.values())
    #print ('Number of cipher blocks with match were {}, and total matched blocks {}'.format(len(matched_blocks), total_blocks_match))
    return matched_blocks

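# Hedged usage sketch: under ECB, identical plaintext blocks encrypt to identical
# ciphertext blocks, so any duplicates returned by is_ecb_mode() flag ECB mode.
# do_aes_128_ecb and aes_key are placeholders for this repo's ECB oracle and key.
plaintext = 'A' * 64  # four identical 16-byte blocks
ciphertext = common_crypt.do_aes_128_ecb(plaintext, aes_key)
if is_ecb_mode(ciphertext, key_size=16):
    print('ECB mode detected')
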
def main(): usage = "usage: %prog [options] nrd.log.vcf\n" parser = OptionParser(usage) # parser.add_option("--matrixonly", action="store_true", dest="matrixonly", help="only print concordance matrixe", default=False) # parser.add_option("--includeRef", action="store_true", dest="includeRef", help="include sites in the set ReferenceInAll", default=False) (options, args) = parser.parse_args() vcfilename = args[0] basename = os.path.splitext(vcfilename)[0] vcfobj = VcfFile(vcfilename) vcfh = open(vcfilename, "r") nrdallfh = open(basename + ".allgenos.nrd.txt", "w") nrdtwofh = open(basename + ".twogenos.nrd.txt", "w") nrdonefh = open(basename + ".onegenos.nrd.txt", "w") vcfobj.parseMetaAndHeaderLines(vcfh) samples = vcfobj.getSampleList() # print samples # print "#setname\t" + "\t".join(samples) for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh): outputline = [[vrec.getPos()]] setname = vrec.returnInfoDict()["set"] # which callset does the site belong to? outputline.append([setname]) # we aggregate genotypes per sample heere vrec_ziptuple = vrec.zipGenotypes(samples) # print vrec_ziptuple """ Since I'm testing against trio, NRD count can be 1 2 or 3 We keep track of the nrd count and print those records to the appropriate file: nrdallfh, nrdtwofh, nrdonefh """ nrd_count = 0 for (compare, eval) in grouper(2, vrec_ziptuple): (comp_allele1, comp_allele2) = compare[1].getAlleles() (eval_allele1, eval_allele2) = eval[1].getAlleles() eval_alleletype = typeofGenotype(eval_allele1, eval_allele2) comp_alleletype = typeofGenotype(comp_allele1, comp_allele2) if eval_alleletype == comp_alleletype: continue outputline.append([eval[0], str(eval_alleletype), compare[0], str(comp_alleletype)]) nrd_count += 1 output = "\t".join(melt_lol(outputline)) """ depending on the nrd count, print the records to appropirate file(s) """ if nrd_count == 3: nrdallfh.write(output + "\n") if nrd_count == 2: nrdtwofh.write(output + "\n") if nrd_count == 1: nrdonefh.write(output + "\n")
def main(options, args):
    if not options.email:
        raise Exception('Must specify -e for profile email id')
    email = options.email

    if options.aes_key and (len(options.aes_key) == 32):
        aes_key = options.aes_key.decode('hex')
    else:
        # aes 128 bits
        aes_key = common_crypt.get_random_byte_string(128 / 8)
    print('aes key:{}'.format(aes_key.encode('hex')))

    # Test decryption mode. cipher text and key must be specified
    if options.cipher_text and options.aes_key:
        cipher_text = options.cipher_text.decode('hex')
        plaintext_profile = common_crypt.do_aes_128_ecb_decryption(cipher_text, aes_key)
        print('Decrypted data of profile: {}'.format(plaintext_profile))
        plaintext_profile = parse_key_values(plaintext_profile)
        print('And after parsing: {}'.format(plaintext_profile))
    else:
        # Guess aes block size
        block_size = common_crypt.guess_ecb_block_size(
            common_crypt.do_aes_128_ecb, aes_key, email)
        print('Block size is: {}'.format(block_size))

        cipher_text_profile = tamper_role(email, aes_key)
        hexdump.hexdump(cipher_text_profile)

        plaintext_profile = common_crypt.do_aes_128_ecb_decryption(
            cipher_text_profile, aes_key)
        print('Decrypted cipher text from attack is: {}'.format(plaintext_profile))
        plaintext_profile = parse_key_values(plaintext_profile)
        print('And after parsing: {}'.format(plaintext_profile))

        # break up cipher text into block-sized chunks and decrypt each one
        for cipher_text_block in common.grouper(cipher_text_profile, block_size):
            cipher_text_block = ''.join(cipher_text_block)
            hexdump.hexdump(cipher_text_block)
            print(common_crypt.do_aes_128_ecb_decryption(cipher_text_block, aes_key))

def aes_cbc_decrypt(message, key, iv):
    key_size = len(key)
    round = 0
    previous_cipher_block = ''
    plaintext = ''
    for cipher_block in common.grouper(message, key_size, fillvalue='\x00'):
        cipher_block = ''.join(cipher_block)  # Convert grouper tuple into a byte string
        # First round of the chain uses the specified iv instead of the previous cipher block
        if round == 0:
            plaintext_block = do_aes_cbc_decrypt_chain(cipher_block, key, iv)
        else:
            iv = previous_cipher_block
            plaintext_block = do_aes_cbc_decrypt_chain(cipher_block, key, iv)
        previous_cipher_block = cipher_block  # This is needed for the next round's iv
        plaintext += plaintext_block
        round += 1
    return plaintext

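# Hedged round-trip sketch for the two CBC helpers above. The key and IV are
# illustrative; grouper() pads the last plaintext block with '\x00', so the recovered
# plaintext is compared after stripping that padding.
key = 'YELLOW SUBMARINE'      # 16-byte key -> 16-byte blocks
iv = '\x00' * len(key)
message = 'Attack at dawn on the eastern front'
ciphertext = aes_cbc_encrypt(message, key, iv)
recovered = aes_cbc_decrypt(ciphertext, key, iv)
assert recovered.rstrip('\x00') == message
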
def main(): usage = "usage: %prog [options] file.vcf.gz \n calcuate NRS and NRD on a vcf generated from CombineVariants --genotypemergeoption UNIQUIFY\n" parser = OptionParser(usage) parser.add_option("--matrixonly", action="store_true", dest="matrixonly", help="only print concordance matrixe", default=False) parser.add_option("--includeRef", action="store_true", dest="includeRef", help="include sites in the set ReferenceInAll", default=False) parser.add_option("--includeFilter", action="store_true", dest="includeFilter", help="include site filtered or not!", default=False) (options, args)=parser.parse_args() vcfilename=args[0] basename=os.path.splitext(os.path.splitext(vcfilename)[0])[0] """ row is eval, column is comparison make a numpy matrix to represent genotype concordance matrix """ concordancetable= np.matrix( [ [ 0,0,0,0 ], [ 0,0,0,0 ], [ 0,0,0,0 ], [ 0,0,0,0 ] ] ) calledtable = np.matrix ( [ [0 ,0] , [0,0] ] ) #outputfile is the the basename of the VCF to be analyzed replaced with a variantEval.txt suffix outputfile=".".join([basename, 'variantEval','txt']) outputfh=open(outputfile, 'w') #log file of sites that contribute to NRS penalty; hom-ref and no-calls at variant sites in comparison set nrslog=".".join([basename, 'nrs','log']) nrdlog=".".join([basename, 'nrd','log']) filterlog=".".join([basename, 'filtered','log']) multialleliclog=".".join([basename, 'multiallelic','log']) concordancelog=".".join([basename, 'concordance','log']) fieldslog=".".join([basename, 'fields', 'log']) nrsfh=open(nrslog, 'w') nrdfh=open(nrdlog, 'w') filteredfh=open(filterlog, 'w') multifh=open(multialleliclog, 'w') concordancefh=open(concordancelog, 'w') fieldsfh=open(fieldslog, 'w') fieldsfh.write('set'+"\n") vcfobj=VcfFile(vcfilename) vcfh=gzip.open(vcfilename,'r') vcfobj.parseMetaAndHeaderLines(vcfh) header=vcfobj.returnHeader() +"\n" nrsfh.write(header) nrdfh.write(header) filteredfh.write(header) concordancefh.write(header) multifh.write(header) #outputfh.write(header) #multifh.write(header) samples=vcfobj.getSampleList() #for (comparename, evalname) in grouper(2,samples): # print comparename, evalname vcf_sample_eval_objects = [ VcfSampleEval(compare,eval,basename) for (compare,eval) in grouper(2,samples) ] for evalObj in vcf_sample_eval_objects: evalObj.writeHeaders(header) totalrecords=0 pattern=';set=(\S+)' for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh): if ',' in vrec.getAlt() > 1: outstring=vrec.toStringwithGenotypes() + "\n" multifh.write(outstring) #continue """ skip homoz reference calls unless you want to include them! """ if 'ReferenceInAll' in vrec.getInfo() and options.includeRef == False: continue """ if variant is filtered, skip it! """ if 'filterIn' in vrec.getInfo() and options.includeFilter == False: outstring=vrec.toStringwithGenotypes() + "\n" filteredfh.write(outstring) continue if 'FilteredInAll' in vrec.getInfo(): outstring=vrec.toStringwithGenotypes() + "\n" filteredfh.write(outstring) continue #returns a list [ (samplename, vcfgenotype) , ... () ] vrec_ziptuple=vrec.zipGenotypes(samples) """ we make a hack and make a list like so: [(sample.variant, compare_genotype, sample.variant2, eval_genotype) ... ] basically it halves the length of vrec_ziptuple and gives it the same structure as the list of VcfSampleEval objects""" compare_eval =[ compare+evalu for (compare,evalu) in grouper(2,vrec_ziptuple) ] #what set are you in? 
        field = re.search(pattern, vrec.getInfo()).groups()[0]
        fieldsfh.write(field + "\n")
        totalrecords += 1

        """ we take records two at a time, assuming the first is the comparison genotype
            and the second is the evaluation genotype """
        for (genotype_tuple, evalObj) in izip(compare_eval, vcf_sample_eval_objects):
            #print genotype_tuple
            compare = genotype_tuple[0:2]
            eval = genotype_tuple[2::]
            #print compare
            #print eval
            (comp_allele1, comp_allele2) = compare[1].getAlleles()
            (eval_allele1, eval_allele2) = eval[1].getAlleles()
            eval_alleletype = typeofGenotype(eval_allele1, eval_allele2)
            comp_alleletype = typeofGenotype(comp_allele1, comp_allele2)

            """ increment the cell count """
            concordancetable[eval_alleletype, comp_alleletype] += 1
            evalObj.incrementcellcount(eval_alleletype, comp_alleletype)

            """ write the genotype record to the appropriate log file """
            # print records that contribute to the NRS penalty
            if eval_alleletype == 3:
                if comp_alleletype == 1 or comp_alleletype == 2:
                    outstring = vrec.toStringwithGenotypes() + "\n"
                    nrsfh.write(outstring)
                    evalObj.writeNrs(outstring)
            if eval_alleletype == 0:
                if comp_alleletype == 1 or comp_alleletype == 2:
                    outstring = vrec.toStringwithGenotypes() + "\n"
                    nrsfh.write(outstring)
                    evalObj.writeNrs(outstring)

            # print records that contribute to the NRD penalty
            if eval_alleletype == 0:
                if comp_alleletype == 1 or comp_alleletype == 2:
                    outstring = vrec.toStringwithGenotypes() + "\n"
                    nrdfh.write(outstring)
                    evalObj.writeNrd(outstring)
                if comp_alleletype == 0:
                    outstring = vrec.toStringwithGenotypes() + "\n"
                    concordancefh.write(outstring)
                    evalObj.writeConcordance(outstring)
            if eval_alleletype == 1:
                if comp_alleletype == 0 or comp_alleletype == 2:
                    outstring = vrec.toStringwithGenotypes() + "\n"
                    nrdfh.write(outstring)
                    evalObj.writeNrd(outstring)
                if comp_alleletype == 1:
                    outstring = vrec.toStringwithGenotypes() + "\n"
                    concordancefh.write(outstring)
                    evalObj.writeConcordance(outstring)
            if eval_alleletype == 2:
                if comp_alleletype == 0 or comp_alleletype == 1:
                    outstring = vrec.toStringwithGenotypes() + "\n"
                    nrdfh.write(outstring)
                    evalObj.writeNrd(outstring)
                if comp_alleletype == 2:
                    outstring = vrec.toStringwithGenotypes() + "\n"
                    concordancefh.write(outstring)
                    evalObj.writeConcordance(outstring)

    for evalObj in vcf_sample_eval_objects:
        evalObj.writeEvalOutput()

    outputfh.write("total records analyzed: " + str(totalrecords) + "\n")
    outputfh.write("rows are eval genotypes columns comparison genotypes\n")
    outputfh.write("\t".join(['', 'AA', 'AB', 'BB', './.']) + "\n")
    rownames = [0, 'AA', 1, 'AB', 2, 'BB', 3, './.']
    for (i, gt) in grouper(2, rownames):
        row = concordancetable[i, :].tolist()
        for r in row:
            outstr = "\t".join(map(str, r))
            outputfh.write(gt + "\t" + outstr + "\n")
    outputfh.write("matrix sum: \n")
    sum = np.sum(concordancetable)
    outputfh.write(str(sum) + "\n")

    # now we figure out how many sites were called or not called
    calledtable[0, 0] = concordancetable[0:3, 0:3].sum()
    calledtable[0, 1] = concordancetable[0:3, 3].sum()
    calledtable[1, 0] = concordancetable[3, 0:3].sum()
    calledtable[1, 1] = concordancetable[3, 3]
    outputfh.write("\n")

    rownames = [0, 'called', 1, './.']
    outputfh.write("rows are eval genotypes columns comparison genotypes\n")
    outputfh.write("\t".join(['', 'called', './.']) + "\n")
]) +"\n" ) for (i, gt) in grouper(2,rownames): row=calledtable[i,:].tolist() for r in row: outstr="\t".join(map(str,r)) outputfh.write( gt +"\t"+outstr+"\n") outputfh.write( "matrix sum: \n") sum=np.sum(calledtable) outputfh.write( str(sum) +"\n") outputfh.write("\n") if options.matrixonly == False: discordance=concordancetable[0,1]+concordancetable[0,2]+concordancetable[1,0]+concordancetable[1,2]+concordancetable[2,0]+concordancetable[2,1] total=concordancetable[0,1]+concordancetable[0,2]+concordancetable[1,0]+concordancetable[1,1]+ concordancetable[1,2]+concordancetable[2,0]+concordancetable[2,1] +concordancetable[2,2] nrd=round( (float(discordance)/float(total)) * 100, 2) variant_count_evaluation= concordancetable[1,1]+ concordancetable[1,2]+ concordancetable[2,1]+ concordancetable[2,2] variant_count_comparison= concordancetable[0,1]+concordancetable[0,2]+concordancetable[1,1]+concordancetable[1,2]+concordancetable[2,1]+concordancetable[2,2]+concordancetable[3,1]+concordancetable[3,2] nrs=round( float(variant_count_evaluation)/float(variant_count_comparison) * 100 , 2) outputfh.write( "NRD: " + str(nrd) +" \n") outputfh.write( "NRS " + str(nrs) +" \n")
def get_prod_stats_from_td(df_timeline):
    prods = pd.DataFrame()

    # reorder_rate
    prods['prod_reorder_rate'] = df_timeline.groupby('product_id').apply(
        lambda order: (sum(order.user_prod_no_of_orders - 1) /
                       float(sum(order.user_prod_orders_since_first_ordered)))
        if sum(order.user_prod_orders_since_first_ordered) > 0 else 0.0)

    # combine the reorder_nums_interval lists, grouped by product_id
    group_size = 10000
    product_ids = prods.index.tolist()
    prods = prods.merge(
        right=papply([df_timeline[df_timeline.product_id.isin(pids)]
                      for pids in grouper(group_size, product_ids)],
                     combine_list_by_prod),
        how='left', left_index=True, right_index=True)

    # calculate averages
    prods['prod_avg_order_nums_intervals'] = prods.user_prod_reorder_nums_intervals.apply(
        lambda x: np.mean(x)).astype(np.float16)
    prods['prod_avg_order_days_intervals'] = prods.user_prod_reorder_days_intervals.apply(
        lambda x: np.mean(x)).astype(np.float16)

    prods['product_id'] = prods.index
    return prods

def processing_data():
    ### loading the data
    IDIR = '../input/'

    # load the basic data
    priors, train, orders, products, aisles, departments = common.load_raw_data(IDIR)
    print('priors {}: {}'.format(priors.shape, ', '.join(priors.columns)))
    print('orders {}: {}'.format(orders.shape, ', '.join(orders.columns)))
    print('train {}: {}'.format(train.shape, ', '.join(train.columns)))

    # load timeline related data
    timeline_data = get_timeline_data()

    # load user and product category data
    user_cat_data, prod_cat_data, user_prod_cat_match_data = get_user_prod_cat_data()

    ### Preprocessing: merge order and product information into priors
    print('add order info to priors')
    orders.set_index('order_id', inplace=True, drop=False)
    priors = priors.join(orders, on='order_id', rsuffix='_')
    priors.drop('order_id_', inplace=True, axis=1)

    print('add product info to priors')
    priors = pd.merge(priors, products, how='left', on='product_id')

    ### get product statistic data
    prod_stats = get_prod_stats(priors, timeline_data)
    print('prod_stats {}: {}'.format(prod_stats.shape, ', '.join(prod_stats.columns)))

    ### get user statistic data
    user_stats = get_user_stats(priors, timeline_data)
    print('user_stats {}: {}'.format(user_stats.shape, ', '.join(user_stats.columns)))

    ### get user x product statistic data
    userXprod_stats = get_userXprod_stats(priors, timeline_data)
    print('userXprod_stats {}: {}'.format(userXprod_stats.shape, ', '.join(userXprod_stats.columns)))

    ### get user category statistic data
    print('add user_cat info to priors')
    priors = pd.merge(priors, user_cat_data, how='left', on='user_id')
    user_cat_stats = get_user_cat_stats(priors)
    print('user_cat_stats {}: {}'.format(user_cat_stats.shape, ', '.join(user_cat_stats.columns)))

    print('add prod_cat info into priors')
    priors = pd.merge(priors, prod_cat_data, how='left', on='product_id')
    prod_cat_stats = get_prod_cat_stats(priors)
    print('prod_cat_stats {}: {}'.format(prod_cat_stats.shape, ', '.join(prod_cat_stats.columns)))

    ## question here
    ## why not merge user_cat_stats and prod_cat_stats?
    ## Note the function only processed stats for cat_20

    ### postprocessing for training data
    # use userXprod_stats as the base to construct the train and test data.
    print('post processing stats data')
    df_x = userXprod_stats
    df_x = df_x.merge(prod_stats.drop(['user_prod_reorder_nums_intervals',
                                       'user_prod_reorder_days_intervals'], axis=1),
                      how='left', on='product_id')
    df_x = df_x.merge(user_stats, how='left', on='user_id')
    df_x = df_x.merge(orders[orders.eval_set != 'prior'], how='left', on='user_id')

    # merge category data
    df_x = df_x.merge(user_cat_data, how='left', on='user_id')
    df_x = df_x.merge(prod_cat_data, how='left', on='product_id')
    df_x = df_x.merge(user_prod_cat_match_data, how='left', on=['user_id', 'product_id'])
    #df_train = df_train.merge(user_cat_data, how='left', on='user_id')
    #df_train = df_train.merge(prod_cat_data, how='left', on='product_id')
    #df_train = pd.merge(df_train, user_prod_cat_match_data, how='left', on=['user_id', 'product_id'])
    #df_test = df_test.merge(user_cat_data, how='left', on='user_id')
    #df_test = df_test.merge(prod_cat_data, how='left', on='product_id')
    #df_test = pd.merge(df_test, user_prod_cat_match_data, how='left', on=['user_id', 'product_id'])

    print(df_x.shape)
    print(df_x.memory_usage())

    ### release memory after the merges
    del priors
    del timeline_data
    del prod_stats, user_stats, userXprod_stats

    # calculate extra features
    print('processing extra feature')
    df_x['user_prod_days_since_last_ordered'] = (
        df_x['user_prod_days_since_last_ordered'] + df_x['days_since_prior_order'])
    df_x['user_prod_orders_since_last_ordered'] = (
        df_x['user_prod_orders_since_last_ordered'] + 1)

    print('processing expect values')
    # combine the reorder_nums_interval lists, grouped by user_id
    group_size = 20000
    user_ids = df_x.user_id.unique().tolist()
    df_x = df_x.merge(
        right=papply([df_x[df_x.user_id.isin(uids)]
                      for uids in grouper(group_size, user_ids)],
                     get_expect_values),
        how='left', left_index=True, right_index=True)
    # df_x['user_prod_days_prob'] = df_x.apply(lambda x: sp.stats.expon.pdf(x.user_prod_days_since_last_ordered, loc=0, scale=x.prod_avg_order_days_intervals), axis=1)
    # df_x['user_prod_orders_prob'] = df_x.apply(lambda x: sp.stats.expon.pdf(x.user_prod_orders_since_last_ordered, loc=1, scale=x.prod_avg_order_nums_intervals), axis=1)
    print('finished processing expect values')

    # split df_x into train and test
    df_train = df_x[df_x.eval_set == 'train']
    df_test = df_x[df_x.eval_set == 'test']

    # merge the labels into df_train
    df_train = df_train.merge(train, how='left', on=['order_id', 'product_id'])
    df_train['reordered'] = df_train.reordered.fillna(0)

    print("processing data finished!")
    return df_train, df_test

out = eval_batch(valid_merged, valid_logit)
out = out.detach().cpu().numpy()

f1_best = get_f1_threshold(out, valid_ohs, __best_threshold)
print(__best_threshold)
print('f1_best(valid)=', f1_best)

best_th = threshold_search(out, valid_ohs)
f1_best = get_f1_threshold(out, valid_ohs, best_th)
print(best_th)
print('f1_best(valid, th searched)=', f1_best)

out_list = []
for batch, logit in zip(grouper(test_merged, bs), grouper(test_logit, bs)):
    batch = [
        b if isinstance(b, torch.Tensor) else torch.from_numpy(b)
        for b in batch if b is not None
    ]
    logit = [
        b if isinstance(b, torch.Tensor) else torch.from_numpy(b)
        for b in logit if b is not None
    ]
    out_batch = net(
        torch.stack(batch, dim=0).cuda(),
        torch.stack(logit, dim=0).cuda())
    out_list.append(out_batch.detach().cpu().numpy())
out = np.concatenate(out_list, axis=0)

def create_dag(dag_filename, status_filename, condor_filename, log_dir, delphes_zip, args):
    """Create a htcondenser.DAGMan to run Delphes over a set of files.

    Parameters
    ----------
    dag_filename: str
        Name to be used for DAG job file.
    status_filename: str
        Name to be used for DAG status file.
    condor_filename: str
        Name of condor job file to be used for each job.
    log_dir : str
        Name of directory to be used for log files.
    delphes_zip : str
        Location of delphes zip file.
    args: argparse.Namespace
        Contains info about output directory, job IDs, number of events per job,
        and args to pass to the executable.

    Returns
    -------
    htcondenser.DAGMan
        DAGMan for all delphes jobs.

    Raises
    ------
    OSError
        If no files in input directory of correct type (lhe, hepmc, gzipped)
    """
    # Collate list of input files
    def accept_file(filename, fmt):
        fl = os.path.basename(filename).lower()
        comp_ext = ['.gz', '.tar.gz', '.tgz']
        extensions = ['.' + fmt.lower() + y for y in comp_ext]
        return (os.path.isfile(filename) and
                any([fl.endswith(ext) for ext in extensions]) and
                not fl.startswith("runmaterial") and
                not fl.startswith('mg5'))

    log.debug(os.listdir(args.iDir))
    abs_idir = os.path.realpath(args.iDir)
    input_files = [os.path.join(abs_idir, f) for f in os.listdir(abs_idir)
                   if accept_file(os.path.join(abs_idir, f), args.type)]
    log.debug(input_files)
    if not input_files:
        raise OSError('No acceptable input file in %s' % args.iDir)

    # Setup DAGMan and JobSet objects
    # ------------------------------------------------------------------------
    log.info("DAG file: %s" % dag_filename)
    delphes_dag = ht.DAGMan(filename=dag_filename, status_file=status_filename)
    delphes_jobset = ht.JobSet(exe='HTCondor/runDelphes.py',
                               copy_exe=True,
                               setup_script='HTCondor/setupDelphes.sh',
                               filename='HTCondor/delphes.condor',
                               out_dir=log_dir, err_dir=log_dir, log_dir=log_dir,
                               memory='100MB', disk='2GB',
                               share_exe_setup=True,
                               common_input_files=[delphes_zip, args.card],
                               transfer_hdfs_input=True,
                               hdfs_store=os.path.join(args.oDir, 'materials'))

    exe_dict = {'hepmc': './DelphesHepMC', 'lhe': './DelphesLHEF'}
    delphes_exe = exe_dict[args.type]

    # We assign each job to run over a certain number of input files.
    files_per_job = 2
    for ind, input_files in enumerate(common.grouper(input_files, files_per_job)):
        input_files = filter(None, input_files)
        job_args = ['--card', os.path.basename(args.card), '--exe', delphes_exe]
        # Add --process commands to job opts
        output_files = [os.path.join(args.oDir, stem(f)) + '.root' for f in input_files]
        for in_file, out_file in zip(input_files, output_files):
            job_args.extend(['--process', in_file, out_file])
        # Since we transfer across files on a one-by-one basis, we don't use
        # input_files or output_files for the input or output ROOT files.
        job = ht.Job(name='delphes%d' % ind, args=job_args)
        delphes_jobset.add_job(job)
        delphes_dag.add_job(job)

    return delphes_dag

def main(): usage = "usage: %prog [options] filebasename" parser = OptionParser(usage) parser.add_option("--file", type="string", dest="basename", help="basename of tped/tfam file") parser.add_option("--twobitfile", type="string", dest="tbf", help="2bit file of reference genome") (options, args)=parser.parse_args() try: sys.stderr.write("opening twobitfile...\n") twobit=bx.seq.twobit.TwoBitFile( open( options.tbf ) ) except: sys.stderr.write("unable to open twobit file!\n") tfamfile=options.basename+".tfam" tpedfile=options.basename+".tped" tfamfh=open(tfamfile, 'r') samplenames=[] for line in tfamfh: (fid,iid,pid,mid,sex,pheno)=line.strip().split(' ') samplenames.append(iid) samplestring="\t".join(samplenames) tpedfh=open(tpedfile,'r') printvcfHeader(options.tbf, tpedfile) print "\t".join(["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", samplestring]) for line in tpedfh: fields=line.strip().split(' ') (chrom, snpid,cM,pos)=fields[0:4] start=int(pos)-1 end=int(pos) try: sequence=twobit[chrom][start:end] sequence=sequence.upper() except: error="unable to fetch sequence from 2bit file!: " + chrom + " " + pos sys.stderr.write(error + "\n") exit(1) refbase=sequence #print chrom, pos,refbase genotypes=fields[4::] if len(genotypes)/2 != len(samplenames): sys.stderr.write("unequal numbers of genotypes and sample names!\n") sys.exit(1) observed_alleles=set(genotypes) altbases= list( observed_alleles - set(refbase) ) alt='.' if len(altbases) == 0: alt='.' elif len(altbases) > 1: alt=",".join(altbases ) else: alt=altbases[0] metainfo="\t".join([chrom,pos,snpid,refbase,alt,'.','.', 'NS='+str(len(samplenames)),'GT']) ngenotypes=[] for genotype in grouper(2, genotypes,'x'): genostr="".join(list(genotype) ) ngenotypes.append( numericalGenotypes(refbase,alt, genostr) ) #print genotypes #print ngenotypes goutput="\t".join(ngenotypes) print metainfo +"\t" +goutput