def writeEvalOutput(self):
        """ write the genotype concordance matrix, the called/uncalled summary,
            and the NRD/NRS metrics for this eval/comparison sample pair """
        self.outputfh.write( "rows are eval genotypes, columns are comparison genotypes\n")
        self.outputfh.write("\t".join(['','AA','AB','BB', './.'  ])  +"\n")

        # pair each concordance-matrix row index with its genotype label
        rownames=[0,'AA', 1,'AB', 2,'BB', 3,'./.']
        for (i, gt) in grouper(2,rownames):
            row=self.concordancetable[i,:].tolist()
            for r in row:
                outstr="\t".join(map(str,r))
                self.outputfh.write( gt +"\t"+outstr+"\n")

        self.outputfh.write( "matrix sum: \n")
        summy=np.sum(self.concordancetable)
        self.outputfh.write( str(summy) +"\n")

        #now we figure out how many sites were called or not called
        self.calledtable[0,0]=self.concordancetable[0:3,0:3].sum()
        self.calledtable[0,1]=self.concordancetable[0:3,3].sum()
        self.calledtable[1,0]=self.concordancetable[3,0:3].sum()
        self.calledtable[1,1]=self.concordancetable[3,3]
        self.outputfh.write("\n")
        rownames=[ 0,'called', 1,'./.' ]
        self.outputfh.write( "rows are eval genotypes, columns are comparison genotypes\n")
        self.outputfh.write(  "\t".join(['','called','./.' ]) +"\n" )

        for (i, gt) in grouper(2,rownames):
            row=self.calledtable[i,:].tolist()
            for r in row:
                outstr="\t".join(map(str,r))
                self.outputfh.write( gt +"\t"+outstr+"\n")
        self.outputfh.write( "matrix sum: \n")
        summy=np.sum(self.calledtable)
        self.outputfh.write( str(summy) +"\n")

        self.outputfh.write("\n")

        # NRD: discordant genotypes among sites called in both sets, excluding the hom-ref/hom-ref cell
        discordance=self.concordancetable[0,1]+self.concordancetable[0,2]+self.concordancetable[1,0]+self.concordancetable[1,2]+self.concordancetable[2,0]+self.concordancetable[2,1]
        total=self.concordancetable[0,1]+self.concordancetable[0,2]+self.concordancetable[1,0]+self.concordancetable[1,1]+ self.concordancetable[1,2]+self.concordancetable[2,0]+self.concordancetable[2,1] +self.concordancetable[2,2]

        nrd=round( (float(discordance)/float(total)) * 100, 2)

        # NRS: variant genotypes recovered in the eval set over variant genotypes present in the comparison set
        variant_count_evaluation= self.concordancetable[1,1]+ self.concordancetable[1,2]+ self.concordancetable[2,1]+ self.concordancetable[2,2]

        variant_count_comparison= self.concordancetable[0,1]+self.concordancetable[0,2]+self.concordancetable[1,1]+self.concordancetable[1,2]+self.concordancetable[2,1]+self.concordancetable[2,2]+self.concordancetable[3,1]+self.concordancetable[3,2]
        nrs=round( float(variant_count_evaluation)/float(variant_count_comparison) * 100 , 2)

        self.outputfh.write( "NRD: " + str(nrd) +" \n")
        self.outputfh.write( "NRS: " + str(nrs) +" \n")

        outstring=",".join( map(str,melt_lol(self.concordancetable.tolist())) )
        self.genotypematrixfh.write(outstring+"\n")
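# These snippets lean on a grouper() helper to walk a sequence in fixed-size chunks,
# e.g. grouper(2, rownames) pairs each matrix row index with its label. The project's
# own helper isn't shown; below is a minimal sketch based on the classic itertools
# recipe, using the grouper(n, iterable) argument order seen above. Note that the
# common.grouper(iterable, n, fillvalue) calls in the crypto examples further down
# use the reversed argument order, so that helper is presumably a distinct function.
from itertools import izip_longest   # itertools.zip_longest on Python 3

def grouper(n, iterable, fillvalue=None):
    """grouper(2, 'ABCDE') --> ('A','B') ('C','D') ('E', None)"""
    args = [iter(iterable)] * n
    return izip_longest(*args, fillvalue=fillvalue)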
def main():
    usage = "usage: %prog [options] file.vcf \n output format values from  genotype data field  in a merged VCF generated by CombineVariants from GATK for suitabale plotting/dataviz"
    parser = OptionParser(usage)
    parser.add_option("--includeRef", action="store_true", dest="includeRef", help="include sites in the set ReferenceInAll", default=False)
    parser.add_option("--includeFilter", action="store_true", dest="includeFilter", help="include site filtered or not!", default=False)
    parser.add_option("--formatTag", dest="format", default="GT", help="format tag to compare (default GT)")
    (options, args)=parser.parse_args()
    vcfilename=args[0]
    #vcfilename='/Users/indapa/software/Pgmsnp/PythonNotebook/child5x.nrs.sites.calledWith20x_bam.child5x.nrs.sites.calledWith5x_bam.combineVariants.vcf'
    
    basename=os.path.splitext(vcfilename)[0]

    vcfobj=VcfFile(vcfilename)
    vcfh=open(vcfilename,'r')

    vcfobj.parseMetaAndHeaderLines(vcfh)
    header=vcfobj.returnHeader() +"\n"

    samples=vcfobj.getSampleList()
    print "\t".join(samples)
    for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
        vrec_ziptuple=vrec.zipGenotypes(samples)
        outputs=[]
        for (compare, eval) in grouper(2,vrec_ziptuple):
            compareGenobj= compare[1]
            evalGenobj= eval[1]
        
            outputs.append( "\t".join( [compareGenobj.getFormatVal(options.format),  evalGenobj.getFormatVal(options.format) ] ) )
        print "\t".join(outputs)
def eval_batch(data_all, logit_all, in_train=False):
    out_list = []
    for batch, logit in zip(grouper(data_all, bs), grouper(logit_all, bs)):
        # drop any padding fill values (grouper typically pads the last chunk with None)
        # and convert numpy arrays to tensors before stacking
        batch = [
            b if isinstance(b, torch.Tensor) else torch.from_numpy(b)
            for b in batch if b is not None
        ]
        logit = [
            b if isinstance(b, torch.Tensor) else torch.from_numpy(b)
            for b in logit if b is not None
        ]
        out_batch = net(
            torch.stack(batch, dim=0).cuda(),
            torch.stack(logit, dim=0).cuda(), in_train)
        out_list.append(out_batch)
    out = torch.cat(out_list, dim=0)
    return out
def aes_cbc_encrypt(message, key, iv):
    ciphertext = ''
    key_size = len(key)

    for block in common.grouper(message,key_size,fillvalue='\x00'):
        block = ''.join(block) # Convert grouper array into byte string
        ciphertext_cbc = do_aes_cbc_encrypt_chain(block, key, iv)
        iv = ciphertext_cbc
        ciphertext += ciphertext_cbc

    return ciphertext
def __str__(self):
    rownames=[0,'AA', 1,'AB', 2,'BB', 3,'./.']
    outstring="\t".join(['','AA','AB','BB', './.']) +"\n"

    for (i, gt) in grouper(2,rownames):
        row=self.concordancetable[i,:].tolist()
        for r in row:
            outstr="\t".join(map(str,r))
            outstring+=( gt +"\t"+outstr+"\n")
    outstring+="eval: "+ self.evalname
    outstring+=" compare: "+ self.comparename +"\n"
    return outstring
def is_ecb_mode(ciphertext, key_size=16):
    #print ('Key size: {}'.format(key_size))

    # Break cipher into key sized blocks and store each block as array elem
    cipher_blocks = [b for cipher_block in common.grouper(ciphertext,key_size) for b in [''.join(cipher_block)]]

    # Get count of all similar cipher blocks
    matched_blocks = [{byte:count} for byte,count in Counter(cipher_blocks).items() if count>1]

    # Total up all matched cipher blocks
    total_blocks_match = sum([count for byte in matched_blocks for count in byte.values()])

    #print ('Number of cipher blocks with match were {}, and total matched blocks {}'.format(len(matched_blocks),total_blocks_match))
    # an empty list is falsey, so the return value doubles as an "ECB detected?" flag
    return matched_blocks
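# A self-contained illustration of the same repeated-block heuristic, without the
# common.grouper dependency (a sketch, not part of the original snippet; assumes the
# ciphertext length is a multiple of the block size):
from collections import Counter

def count_repeated_blocks(ciphertext, block_size=16):
    """Count ciphertext blocks whose bytes also occur in another block."""
    blocks = [ciphertext[i:i + block_size] for i in range(0, len(ciphertext), block_size)]
    return sum(count for count in Counter(blocks).values() if count > 1)

# ECB encrypts identical plaintext blocks to identical ciphertext blocks, so any
# repeats are a strong hint that ECB mode was used.
print count_repeated_blocks(('\x01' * 16) * 3 + '\x02' * 16)   # -> 3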
def main():
    usage = "usage: %prog [options]  nrd.log.vcf\n"
    parser = OptionParser(usage)
    # parser.add_option("--matrixonly", action="store_true", dest="matrixonly", help="only print concordance matrixe", default=False)
    # parser.add_option("--includeRef", action="store_true", dest="includeRef", help="include sites in the set ReferenceInAll", default=False)

    (options, args) = parser.parse_args()
    vcfilename = args[0]
    basename = os.path.splitext(vcfilename)[0]

    vcfobj = VcfFile(vcfilename)
    vcfh = open(vcfilename, "r")
    nrdallfh = open(basename + ".allgenos.nrd.txt", "w")
    nrdtwofh = open(basename + ".twogenos.nrd.txt", "w")
    nrdonefh = open(basename + ".onegenos.nrd.txt", "w")
    vcfobj.parseMetaAndHeaderLines(vcfh)
    samples = vcfobj.getSampleList()
    # print samples
    # print "#setname\t" + "\t".join(samples)
    for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
        outputline = [[vrec.getPos()]]

        setname = vrec.returnInfoDict()["set"]  # which callset does the site belong to?

        outputline.append([setname])  # we aggregate genotypes per sample here

        vrec_ziptuple = vrec.zipGenotypes(samples)
        # print vrec_ziptuple
        """ Since I'm testing against trio, NRD count can be 1 2 or 3
            We keep track of the nrd count and print those records to the appropriate file:
            nrdallfh, nrdtwofh, nrdonefh  """
        nrd_count = 0
        for (compare, eval) in grouper(2, vrec_ziptuple):
            (comp_allele1, comp_allele2) = compare[1].getAlleles()
            (eval_allele1, eval_allele2) = eval[1].getAlleles()
            eval_alleletype = typeofGenotype(eval_allele1, eval_allele2)
            comp_alleletype = typeofGenotype(comp_allele1, comp_allele2)
            if eval_alleletype == comp_alleletype:
                continue
            outputline.append([eval[0], str(eval_alleletype), compare[0], str(comp_alleletype)])
            nrd_count += 1

        output = "\t".join(melt_lol(outputline))
        """ depending on the nrd count, print the records to appropirate file(s) """
        if nrd_count == 3:
            nrdallfh.write(output + "\n")
        if nrd_count == 2:
            nrdtwofh.write(output + "\n")
        if nrd_count == 1:
            nrdonefh.write(output + "\n")
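# typeofGenotype() is used throughout these scripts but isn't shown. Judging from the
# concordance-table labels (0=AA, 1=AB, 2=BB, 3=./.), it maps an allele pair to one of
# those four classes; a hedged sketch of what such a helper could look like (the real
# project helper may encode alleles differently):
def typeofGenotype(allele1, allele2):
    """Hypothetical reconstruction: 0 hom-ref, 1 het, 2 hom-alt, 3 no-call."""
    if allele1 == '.' or allele2 == '.':
        return 3                       # missing genotype (./.)
    if allele1 != allele2:
        return 1                       # heterozygous (AB)
    return 0 if allele1 == '0' else 2  # homozygous ref (AA) or alt (BB)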
def main(options, args):
    if not options.email:
        raise Exception('Must specify -e for profile email id')
    email = options.email

    if options.aes_key and (len(options.aes_key) == 32):
        aes_key = options.aes_key.decode('hex')
    else:
        # aes 128 bits
        aes_key = common_crypt.get_random_byte_string(128 / 8)
    print('aes key:{}'.format(aes_key.encode('hex')))

    # Test decryption mode. cipher text and key must be specified
    if options.cipher_text and options.aes_key:
        cipher_text = options.cipher_text.decode('hex')
        plaintext_profile = common_crypt.do_aes_128_ecb_decryption(
            cipher_text, aes_key)
        print('Decrypted data of profile: {}'.format(plaintext_profile))
        plaintext_profile = parse_key_values(plaintext_profile)
        print('And after parsing: {}'.format(plaintext_profile))
    else:
        # Guess aes block size
        block_size = common_crypt.guess_ecb_block_size(
            common_crypt.do_aes_128_ecb, aes_key, email)
        print('Block size is: {}'.format(block_size))
        cipher_text_profile = tamper_role(email, aes_key)
        hexdump.hexdump(cipher_text_profile)

        plaintext_profile = common_crypt.do_aes_128_ecb_decryption(
            cipher_text_profile, aes_key)
        print('Decrypted cipher text from attack is: {}'.format(
            plaintext_profile))

        plaintext_profile = parse_key_values(plaintext_profile)
        print('And after parsing: {}'.format(plaintext_profile))

        # break up cipher text into block size
        for cipher_text_block in common.grouper(cipher_text_profile,
                                                block_size):
            cipher_text_block = ''.join(cipher_text_block)
            hexdump.hexdump(cipher_text_block)
            print(
                common_crypt.do_aes_128_ecb_decryption(cipher_text_block,
                                                       aes_key))
def aes_cbc_decrypt(message, key, iv):
    key_size = len(key)
    round = 0
    previous_cipher_block = ''
    plaintext = ''

    for cipher_block in common.grouper(message,key_size,fillvalue='\x00'):
        cipher_block = ''.join(cipher_block) # Convert grouper array into byte string

        # First round of chain uses the specified iv instead of previous decrypted block
        if round == 0:
            plaintext_block = do_aes_cbc_decrypt_chain(cipher_block, key, iv)
        else:
            iv = previous_cipher_block
            plaintext_block = do_aes_cbc_decrypt_chain(cipher_block, key, iv)

        previous_cipher_block = cipher_block # This is needed for next round iv
        plaintext += plaintext_block
        round += 1

    return plaintext
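# Hypothetical round trip through the two CBC helpers above, assuming the
# do_aes_cbc_*_chain functions (not shown here) are inverse single-block operations
# and that key and iv are block-sized byte strings:
key = 'YELLOW SUBMARINE'              # 16-byte key
iv = '\x00' * len(key)                # all-zero initialization vector
ct = aes_cbc_encrypt('Hello CBC world', key, iv)
pt = aes_cbc_decrypt(ct, key, iv)
assert pt.rstrip('\x00') == 'Hello CBC world'   # grouper zero-pads the final block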
def main():
    usage = "usage: %prog [options] file.vcf.gz \n calcuate NRS and NRD on a vcf generated from CombineVariants --genotypemergeoption UNIQUIFY\n"
    parser = OptionParser(usage)
    
    parser.add_option("--matrixonly", action="store_true", dest="matrixonly", help="only print concordance matrixe", default=False)
    parser.add_option("--includeRef", action="store_true", dest="includeRef", help="include sites in the set ReferenceInAll", default=False)
    parser.add_option("--includeFilter", action="store_true", dest="includeFilter", help="include site filtered or not!", default=False)
    (options, args)=parser.parse_args()

    vcfilename=args[0]
    basename=os.path.splitext(os.path.splitext(vcfilename)[0])[0]
    """ row is eval, column is comparison 
        make a numpy matrix to represent genotype concordance matrix """
    
    concordancetable= np.matrix( [ [ 0,0,0,0 ], [ 0,0,0,0 ], [ 0,0,0,0 ], [ 0,0,0,0 ] ] )
    calledtable = np.matrix ( [ [0 ,0] , [0,0] ] )
    
    #the output file is the basename of the analyzed VCF with a .variantEval.txt suffix
    outputfile=".".join([basename, 'variantEval','txt'])
    outputfh=open(outputfile, 'w')
    #log file of sites that contribute to NRS penalty; hom-ref and no-calls at variant sites in comparison set
    nrslog=".".join([basename, 'nrs','log'])
    nrdlog=".".join([basename, 'nrd','log'])
    filterlog=".".join([basename, 'filtered','log'])
    multialleliclog=".".join([basename, 'multiallelic','log'])
    concordancelog=".".join([basename, 'concordance','log'])
    fieldslog=".".join([basename, 'fields', 'log'])
    nrsfh=open(nrslog, 'w')
    nrdfh=open(nrdlog, 'w')
    filteredfh=open(filterlog, 'w')
    multifh=open(multialleliclog, 'w')
    concordancefh=open(concordancelog, 'w')
    fieldsfh=open(fieldslog, 'w')
    fieldsfh.write('set'+"\n")
    vcfobj=VcfFile(vcfilename)
    vcfh=gzip.open(vcfilename,'r')

    vcfobj.parseMetaAndHeaderLines(vcfh)
    header=vcfobj.returnHeader() +"\n"
    
    nrsfh.write(header)
    nrdfh.write(header)
    filteredfh.write(header)
    concordancefh.write(header)
    multifh.write(header)
    #outputfh.write(header)
    #multifh.write(header)

    samples=vcfobj.getSampleList()
    
    #for (comparename, evalname) in grouper(2,samples):
    #    print comparename, evalname
    vcf_sample_eval_objects = [ VcfSampleEval(compare,eval,basename) for  (compare,eval) in grouper(2,samples) ] 
    
    for evalObj in vcf_sample_eval_objects:
        evalObj.writeHeaders(header)
    
    totalrecords=0

    pattern=';set=(\S+)'
    for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
        if ',' in vrec.getAlt(): #multi-allelic site; log it to the multiallelic file
            outstring=vrec.toStringwithGenotypes() + "\n"
            multifh.write(outstring)
            #continue


        """ skip homoz reference calls unless you want to include them!  """
        if 'ReferenceInAll' in vrec.getInfo() and options.includeRef == False:
            continue

        """ if variant is filtered, skip it! """
        if 'filterIn' in vrec.getInfo() and options.includeFilter == False:
            outstring=vrec.toStringwithGenotypes() + "\n"
            filteredfh.write(outstring)
            continue
        if 'FilteredInAll' in vrec.getInfo():
            outstring=vrec.toStringwithGenotypes() + "\n"
            filteredfh.write(outstring)
            continue
        #returns a list [ (samplename, vcfgenotype) , ... () ]
        vrec_ziptuple=vrec.zipGenotypes(samples)
        """ we make a hack and make a list like so:
           [(sample.variant, compare_genotype, sample.variant2, eval_genotype) ...   ] 
           basically it halves the length of vrec_ziptuple and gives it the same structure
           as the list of VcfSampleEval objects"""
        compare_eval =[ compare+evalu  for (compare,evalu) in grouper(2,vrec_ziptuple) ]
        
       
        #what set are you in?
        field=re.search(pattern, vrec.getInfo()).groups()[0]
        fieldsfh.write(field+"\n")
        totalrecords+=1
        """ we take records two at a time, assuming the first is the comparison genotype the second is the evaluation genotype  """
        
        for (genotype_tuple, evalObj) in izip(compare_eval, vcf_sample_eval_objects):
            
            #print genotype_tuple
            compare=genotype_tuple[0:2]
            eval=genotype_tuple[2::]
            #print compare
            #print eval
            

           
                
            (comp_allele1, comp_allele2)=compare[1].getAlleles()
            (eval_allele1, eval_allele2)=eval[1].getAlleles()

            eval_alleletype=typeofGenotype(eval_allele1, eval_allele2)
            comp_alleletype=typeofGenotype(comp_allele1, comp_allele2)
           
            """ increment the cell count  """
            concordancetable[eval_alleletype, comp_alleletype]+=1
            evalObj.incrementcellcount(eval_alleletype,comp_alleletype)


            """write gentoype record to log appropriate log file """
            #print records that contirubut the NRS penalty
            if eval_alleletype == 3:
                if comp_alleletype == 1 or comp_alleletype==2:
                    outstring=vrec.toStringwithGenotypes() + "\n"
                    nrsfh.write( outstring)
                    evalObj.writeNrs(outstring)
            if eval_alleletype==0:
                if comp_alleletype == 1 or comp_alleletype == 2:
                    outstring=vrec.toStringwithGenotypes() + "\n"
                    nrsfh.write( outstring )
                    evalObj.writeNrs(outstring)
    
        
            #print records that contribute to NRD penalty
            if eval_alleletype==0:
                if comp_alleletype == 1 or comp_alleletype == 2:
                    outstring=vrec.toStringwithGenotypes() + "\n"
                    nrdfh.write( outstring )
                    evalObj.writeNrd(outstring)
                if comp_alleletype == 0:
                    outstring=vrec.toStringwithGenotypes() + "\n"
                    concordancefh.write( outstring )
                    evalObj.writeConcordance( outstring)
            if eval_alleletype == 1:
                if comp_alleletype == 0 or comp_alleletype == 2:
                    outstring=vrec.toStringwithGenotypes() + "\n"
                    nrdfh.write( outstring )
                    evalObj.writeNrd(outstring)
                if comp_alleletype == 1:
                    outstring=vrec.toStringwithGenotypes() + "\n"
                    concordancefh.write( outstring )
                    evalObj.writeConcordance( outstring)
            if eval_alleletype == 2:
                if comp_alleletype == 0 or comp_alleletype ==1:
                    outstring=vrec.toStringwithGenotypes() + "\n"
                    nrdfh.write( outstring )
                    evalObj.writeNrd(outstring)
                if comp_alleletype == 2:
                    outstring=vrec.toStringwithGenotypes() + "\n"
                    concordancefh.write( outstring )
                    evalObj.writeConcordance( outstring)

    
    for evalObj in vcf_sample_eval_objects:
        evalObj.writeEvalOutput()
    
    outputfh.write("total records analyzed: " + str(totalrecords) + "\n" )

    outputfh.write( "rows are eval genotypes columns comparison genotypes\n")
    
    outputfh.write("\t".join(['','AA','AB','BB', './.'  ])  +"\n")
   
    rownames=[0,'AA', 1,'AB', 2,'BB', 3,'./.']
    for (i, gt) in grouper(2,rownames):
        row=concordancetable[i,:].tolist()
        for r in row:
            outstr="\t".join(map(str,r))
            outputfh.write( gt +"\t"+outstr+"\n")

    outputfh.write( "matrix sum: \n")
    sum=np.sum(concordancetable)
    outputfh.write( str(sum) +"\n")

    #now we figure out how many sites were called or not called
    calledtable[0,0]=concordancetable[0:3,0:3].sum()
    calledtable[0,1]=concordancetable[0:3,3].sum()
    calledtable[1,0]=concordancetable[3,0:3].sum()
    calledtable[1,1]=concordancetable[3,3]
    outputfh.write("\n")
    rownames=[ 0,'called', 1,'./.' ]
    outputfh.write( "rows are eval genotypes columns comparison genotypes\n")
    
    outputfh.write(  "\t".join(['','called','./.' ]) +"\n" )
    
    for (i, gt) in grouper(2,rownames):
        row=calledtable[i,:].tolist()
        for r in row:
            outstr="\t".join(map(str,r))
            outputfh.write( gt +"\t"+outstr+"\n")
    outputfh.write( "matrix sum: \n")
    sum=np.sum(calledtable)
    outputfh.write( str(sum) +"\n")
   
    outputfh.write("\n")


    if options.matrixonly == False:
        # NRD: discordant genotypes among sites called in both sets, excluding the hom-ref/hom-ref cell
        discordance=concordancetable[0,1]+concordancetable[0,2]+concordancetable[1,0]+concordancetable[1,2]+concordancetable[2,0]+concordancetable[2,1]
        total=concordancetable[0,1]+concordancetable[0,2]+concordancetable[1,0]+concordancetable[1,1]+ concordancetable[1,2]+concordancetable[2,0]+concordancetable[2,1] +concordancetable[2,2]
    
        nrd=round( (float(discordance)/float(total)) * 100, 2)
    
        # NRS: variant genotypes recovered in the eval set over variant genotypes present in the comparison set
        variant_count_evaluation= concordancetable[1,1]+ concordancetable[1,2]+ concordancetable[2,1]+ concordancetable[2,2]
    
        variant_count_comparison= concordancetable[0,1]+concordancetable[0,2]+concordancetable[1,1]+concordancetable[1,2]+concordancetable[2,1]+concordancetable[2,2]+concordancetable[3,1]+concordancetable[3,2]
        nrs=round( float(variant_count_evaluation)/float(variant_count_comparison) * 100 , 2)
    
        outputfh.write( "NRD: " + str(nrd) +" \n")
        outputfh.write( "NRS: " + str(nrs) +" \n")
def get_prod_stats_from_td(df_timeline):

    prods = pd.DataFrame()

    # reorder_rate
    prods['prod_reorder_rate']=df_timeline.groupby('product_id').\
        apply(lambda order: (sum(order.user_prod_no_of_orders-1) / float(sum(order.user_prod_orders_since_first_ordered))) if sum(order.user_prod_orders_since_first_ordered) > 0 else 0.0)
    # combine reorder_nums_interval list which grouped by product_id
    group_size = 10000
    product_ids = prods.index.tolist()
    prods = prods.merge(right=papply([df_timeline[df_timeline.product_id.isin(pids)] for pids in grouper(group_size, product_ids)],combine_list_by_prod),\
                how='left', left_index=True, right_index=True)
    # calculate average
    prods[
        'prod_avg_order_nums_intervals'] = prods.user_prod_reorder_nums_intervals.apply(
            lambda x: np.mean(x)).astype(np.float16)
    prods[
        'prod_avg_order_days_intervals'] = prods.user_prod_reorder_days_intervals.apply(
            lambda x: np.mean(x)).astype(np.float16)
    prods['product_id'] = prods.index
    return prods
def processing_data():
    ### loading the data
    IDIR = '../input/'

    # load the basic data
    priors, train, orders, products, aisles, departments = common.load_raw_data(
        IDIR)

    print('priors {}: {}'.format(priors.shape, ', '.join(priors.columns)))
    print('orders {}: {}'.format(orders.shape, ', '.join(orders.columns)))
    print('train {}: {}'.format(train.shape, ', '.join(train.columns)))

    # load timeline related data
    timeline_data = get_timeline_data()

    # load user and product category data
    user_cat_data, prod_cat_data, user_prod_cat_match_data = get_user_prod_cat_data(
    )

    ### Preprocessing, combine order, product information into priors
    print('add order info to priors')
    orders.set_index('order_id', inplace=True, drop=False)
    priors = priors.join(orders, on='order_id', rsuffix='_')
    priors.drop('order_id_', inplace=True, axis=1)

    ###
    print('add product info to priors')
    priors = pd.merge(priors, products, how='left', on='product_id')

    ### get product statistic data
    prod_stats = get_prod_stats(priors, timeline_data)
    print('prod_stats {}: {}'.format(prod_stats.shape,
                                     ', '.join(prod_stats.columns)))

    ### get user statistic data
    user_stats = get_user_stats(priors, timeline_data)
    print('user_stats {}: {}'.format(user_stats.shape,
                                     ', '.join(user_stats.columns)))

    ### get userXprod statistic data
    userXprod_stats = get_userXprod_stats(priors, timeline_data)
    print('userXprod_stats {}: {}'.format(userXprod_stats.shape,
                                          ', '.join(userXprod_stats.columns)))

    ### get user category statistic data
    print('add user_cat info to priors')
    priors = pd.merge(priors, user_cat_data, how='left', on='user_id')

    user_cat_stats = get_user_cat_stats(priors)
    print('user_cat_stats {}: {}'.format(user_cat_stats.shape,
                                         ', '.join(user_cat_stats.columns)))

    print('add prod_cat info into priors')
    priors = pd.merge(priors, prod_cat_data, how='left', on='product_id')

    prod_cat_stats = get_prod_cat_stats(priors)
    print('prod_cat_stats {}: {}'.format(prod_cat_stats.shape,
                                         ', '.join(prod_cat_stats.columns)))

    ## question here
    ## why not merge user_cat_stats and prod_cat_stats?
    ## Note the function only processed stats for cat_20

    ### postprocessing for training data
    # use userXprod_stats as the base to construct train and test data.
    print('post processing stats data')
    df_x = userXprod_stats
    df_x = df_x.merge(prod_stats.drop(['user_prod_reorder_nums_intervals', 'user_prod_reorder_days_intervals'],axis=1),\
                      how='left',on='product_id')
    df_x = df_x.merge(user_stats, how='left', on='user_id')
    df_x = df_x.merge(orders[orders.eval_set != 'prior'],
                      how='left',
                      on='user_id')

    # merge category data
    df_x = df_x.merge(user_cat_data, how='left', on='user_id')
    df_x = df_x.merge(prod_cat_data, how='left', on='product_id')
    df_x = df_x.merge(user_prod_cat_match_data,
                      how='left',
                      on=['user_id', 'product_id'])

    #df_train = df_train.merge(user_cat_data,how='left',on='user_id')
    #df_train = df_train.merge(prod_cat_data,how='left', on='product_id')
    #df_train=pd.merge(df_train,user_prod_cat_match_data, how='left',on=['user_id', 'product_id'])
    #df_test = df_test.merge(user_cat_data,how='left',on='user_id')
    #df_test = df_test.merge(prod_cat_data,how='left', on='product_id')
    #df_test=pd.merge(df_test,user_prod_cat_match_data, how='left',on=['user_id', 'product_id'])

    print(df_x.shape)
    print(df_x.memory_usage())

    ### release memory after merge
    del priors
    del timeline_data
    del prod_stats, user_stats, userXprod_stats

    # calculate extra features
    print('processing extra feature')
    df_x['user_prod_days_since_last_ordered'] = df_x[
        'user_prod_days_since_last_ordered'] + df_x['days_since_prior_order']
    df_x['user_prod_orders_since_last_ordered'] = df_x[
        'user_prod_orders_since_last_ordered'] + 1

    print('processing expect values')
    # combine reorder_nums_interval list which grouped by product_id
    group_size = 20000
    user_ids = df_x.user_id.unique().tolist()

    df_x = df_x.merge(right=papply([df_x[df_x.user_id.isin(uids)] for uids in grouper(group_size, user_ids)],get_expect_values),\
                how='left', left_index=True, right_index=True)
    # df_x['user_prod_days_prob'] = df_x.apply(lambda x: sp.stats.expon.pdf(x.user_prod_days_since_last_ordered, loc=0,scale=x.prod_avg_order_days_intervals),axis=1)
    # df_x['user_prod_orders_prob'] = df_x.apply(lambda x: sp.stats.expon.pdf(x.user_prod_orders_since_last_ordered, loc=1,scale=x.prod_avg_order_nums_intervals),axis=1)
    print('finished processing expect values')

    # split the train and test for df_x
    df_train = df_x[df_x.eval_set == 'train']
    df_test = df_x[df_x.eval_set == 'test']

    # merge the label into df_train
    df_train = df_train.merge(train, how='left', on=['order_id', 'product_id'])
    df_train['reordered'] = df_train.reordered.fillna(0)

    print("processing data finished!")
    return df_train, df_test
    out = eval_batch(valid_merged, valid_logit)
    out = out.detach().cpu().numpy()

    f1_best = get_f1_threshold(out, valid_ohs, __best_threshold)

    print(__best_threshold)
    print('f1_best(valid)=', f1_best)

    best_th = threshold_search(out, valid_ohs)
    f1_best = get_f1_threshold(out, valid_ohs, best_th)

    print(best_th)
    print('f1_best(valid, th searched)=', f1_best)

    out_list = []
    for batch, logit in zip(grouper(test_merged, bs), grouper(test_logit, bs)):
        batch = [
            b if isinstance(b, torch.Tensor) else torch.from_numpy(b)
            for b in batch if b is not None
        ]
        logit = [
            b if isinstance(b, torch.Tensor) else torch.from_numpy(b)
            for b in logit if b is not None
        ]
        out_batch = net(
            torch.stack(batch, dim=0).cuda(),
            torch.stack(logit, dim=0).cuda())
        out_list.append(out_batch.detach().cpu().numpy())

    out = np.concatenate(out_list, axis=0)
def create_dag(dag_filename, status_filename, condor_filename, log_dir, delphes_zip, args):
    """Create a htcondenser.DAGMan to run Delphes over a set of files.

    Parameters
    ----------
    dag_filename: str
        Name to be used for DAG job file.
    status_filename: str
        Name to be used for DAG status file.
    condor_filename: str
        Name of condor job file to be used for each job.
    log_dir : str
        Name of directory to be used for log files.
    delphes_zip : str
        Location of delphes zip file.
    args: argparse.Namespace
        Contains info about output directory, job IDs, number of events per job,
        and args to pass to the executable.

    Returns
    -------
    htcondenser.DAGMan
        DAGMan for all delphes jobs.

    Raises
    ------
    OSError
        If no files in input directory of correct type (lhe, hepmc, gzipped)

    """
    # Collate list of input files
    def accept_file(filename, fmt):
        fl = os.path.basename(filename).lower()
        comp_ext = ['.gz', '.tar.gz', '.tgz']
        extensions = ['.' + fmt.lower() + y for y in comp_ext]
        return (os.path.isfile(filename) and
                any([fl.endswith(ext) for ext in extensions]) and
                not fl.startswith("runmaterial") and
                not fl.startswith('mg5'))

    log.debug(os.listdir(args.iDir))
    abs_idir = os.path.realpath(args.iDir)
    input_files = [os.path.join(abs_idir, f) for f in os.listdir(abs_idir)
                   if accept_file(os.path.join(abs_idir, f), args.type)]
    log.debug(input_files)
    if not input_files:
        raise OSError('No acceptable input file in %s' % args.iDir)

    # Setup DAGMan and JobSet objects
    # ------------------------------------------------------------------------
    log.info("DAG file: %s" % dag_filename)
    delphes_dag = ht.DAGMan(filename=dag_filename, status_file=status_filename)

    delphes_jobset = ht.JobSet(exe='HTCondor/runDelphes.py', copy_exe=True,
                               setup_script='HTCondor/setupDelphes.sh',
                               filename='HTCondor/delphes.condor',
                               out_dir=log_dir, err_dir=log_dir, log_dir=log_dir,
                               memory='100MB', disk='2GB',
                               share_exe_setup=True,
                               common_input_files=[delphes_zip, args.card],
                               transfer_hdfs_input=True,
                               hdfs_store=os.path.join(args.oDir, 'materials'))

    exe_dict = {'hepmc': './DelphesHepMC', 'lhe': './DelphesLHEF'}
    delphes_exe = exe_dict[args.type]

    # We assign each job to run over a certain number of input files.
    files_per_job = 2
    for ind, input_files in enumerate(common.grouper(input_files, files_per_job)):
        input_files = filter(None, input_files)

        job_args = ['--card', os.path.basename(args.card), '--exe', delphes_exe]

        # Add --process commands to job opts
        output_files = [os.path.join(args.oDir, stem(f)) + '.root' for f in input_files]
        for in_file, out_file in zip(input_files, output_files):
            job_args.extend(['--process', in_file, out_file])

        # Since we transfer across files on a one-by-one basis, we don't use
        # input_files or output_files for the input or output ROOT files.
        job = ht.Job(name='delphes%d' % ind, args=job_args)
        delphes_jobset.add_job(job)
        delphes_dag.add_job(job)

    return delphes_dag
def main():
    usage = "usage: %prog [options] filebasename"
    parser = OptionParser(usage)
    parser.add_option("--file", type="string", dest="basename", help="basename of tped/tfam file")
    parser.add_option("--twobitfile", type="string", dest="tbf", help="2bit file of reference genome")


    (options, args)=parser.parse_args()
    

    try:
        sys.stderr.write("opening twobitfile...\n")
        twobit=bx.seq.twobit.TwoBitFile( open( options.tbf ) )
    except:
        sys.stderr.write("unable to open twobit file!\n")
        sys.exit(1)

    tfamfile=options.basename+".tfam"
    tpedfile=options.basename+".tped"

    tfamfh=open(tfamfile, 'r')
    samplenames=[]
    for line in tfamfh:
        (fid,iid,pid,mid,sex,pheno)=line.strip().split(' ')
        samplenames.append(iid)

    samplestring="\t".join(samplenames)


    tpedfh=open(tpedfile,'r')
    printvcfHeader(options.tbf, tpedfile)
    print "\t".join(["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER",  "INFO",  "FORMAT", samplestring])
    for line in  tpedfh:
        fields=line.strip().split(' ')
        (chrom, snpid,cM,pos)=fields[0:4]
        start=int(pos)-1
        end=int(pos)
        
        try:
            sequence=twobit[chrom][start:end]
            sequence=sequence.upper()
        except:
            error="unable to fetch sequence from 2bit file!: " + chrom + " " + pos
            sys.stderr.write(error + "\n")
            exit(1)
        refbase=sequence
        #print chrom, pos,refbase
        genotypes=fields[4::]
        if len(genotypes)/2 != len(samplenames):
            sys.stderr.write("unequal numbers of genotypes and sample names!\n")
            sys.exit(1)
        observed_alleles=set(genotypes)
        altbases= list( observed_alleles - set(refbase) )
        alt='.'
        if len(altbases) == 0:
            alt='.'
        elif len(altbases) > 1:
            alt=",".join(altbases )
        else:
            alt=altbases[0]

        
        metainfo="\t".join([chrom,pos,snpid,refbase,alt,'.','.', 'NS='+str(len(samplenames)),'GT'])
        
        ngenotypes=[]
        for genotype in grouper(2, genotypes,'x'):
            genostr="".join(list(genotype) )
            ngenotypes.append( numericalGenotypes(refbase,alt, genostr) )

        #print genotypes
        #print ngenotypes
        goutput="\t".join(ngenotypes)
        print metainfo +"\t" +goutput
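# numericalGenotypes() is called above but not shown. It evidently turns a two-character
# allele string into a VCF GT field relative to the REF base and ALT string; a hedged
# sketch of one way it could work (the real project helper may handle multi-allelic ALT
# strings and missing data differently):
def numericalGenotypes(refbase, alt, genostr):
    """Hypothetical reconstruction: map a two-allele string like 'AG' to a VCF GT
       such as '0/1', given the REF base and the comma-separated ALT string."""
    altbases = alt.split(',') if alt != '.' else []
    coded = []
    for allele in genostr:
        if allele == refbase:
            coded.append('0')
        elif allele in altbases:
            coded.append(str(altbases.index(allele) + 1))
        else:
            coded.append('.')          # unknown or missing allele
    return "/".join(coded)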