Example #1
0
    def get_Gallup_country_lookups(verbose=True):
        """ Kosovo is the only GWP country not matched to a 3-letter ISO code. Let's ignore it.
        """
        dfr = pd.read_table(__local_input_path__+'GallupWorldPoll-region-country.tsv').rename(columns={'country':'rcountry'})
        dfr['lccountry'] = dfr.rcountry.str.lower()
        dfr = dfr.set_index('lccountry')
        dfw = pd.read_table(__local_input_path__+'GallupWorldPoll-WP5-defs-2016.tsv').rename(columns={'country':'wcountry'})
        dfw['lccountry'] = dfw.wcountry.str.lower()
        dfw = dfw.set_index('lccountry')
        wp5s = pd.read_table(__local_input_path__ +'countrycode_main.tsv',  skiprows=3).set_index('country_GWP3_wp5')
        wp5s = wp5s[['countryCode_GWP3_wp5', 'countryCode_ISO3','country_bestShortName','country_bestName','twoletter_AlexShultz_svg']]
        df= wp5s.join(dfr).join(dfw).rename(columns = {'countryCode_ISO3':'ISO',})
        df.index.name = 'country'
        assert 'South Africa'.lower() in dfr.rcountry
        assert 'South Africa'.lower() in df.index


        # Now several checks:
        # Did regions get their ISO?
        problems = {
            ' Published WHR country lacks an ISO: ': df[pd.notnull(df.rcountry) & pd.isnull(df.ISO)][['ISO','countryCode_GWP3_wp5','WP5','rcountry']],
            ' Published WHR country lacks a WP5: ': df[pd.notnull(df.rcountry) & pd.isnull(df.WP5)],
            ' Published WHR country lacks a map code: ': df[pd.notnull(df.rcountry) & pd.isnull(df.twoletter_AlexShultz_svg)],
            ' Old Gallup micro country lacks an ISO in my master lookup: ': df[pd.notnull(df.countryCode_GWP3_wp5) & pd.isnull(df.ISO)][['ISO','countryCode_GWP3_wp5','WP5','wcountry']],
            ' 2016 Gallup micro country lacks an ISO in my master lookup: ': df[pd.notnull(df.WP5) & pd.isnull(df.ISO)][['ISO','countryCode_GWP3_wp5','WP5','wcountry']],
        }

        if verbose:
            for tt,dd in problems.items():
                if not dd.empty:
                    print('\n\n -- country_tools WARNING: '+tt)
                    print(dd)
        return df.reset_index()
Example #2
0
def main():

    parser = argparse.ArgumentParser(description="Extract fasta file.")
    parser.add_argument('-table1',nargs=1,type=str,help="First table.")
    parser.add_argument('-table2',nargs=1,type=str,help="Second table.")
    parser.add_argument('-table3',nargs=1,type=str,help="Third table.")
    args = parser.parse_args()



    #load tables

    table1 = pandas.read_table(args.table1[0])
    table1.index = table1['Unnamed: 0']
    table2 = pandas.read_table(args.table2[0])
    table2.index = table2['Unnamed: 0']
    table3 = pandas.read_table(args.table3[0])
    table3.index = table3['Unnamed: 0']



    print '\n' + args.table1[0] + '\n'
    print 'Number p-value <= 0.05: '+str(len(table1))
    print 'Number FDR <= 0.05: '+str(sum(table1.FDR<=0.05))+'\n'

    print '\n' + args.table2[0] + '\n'
    print 'Number p-value <= 0.05: '+str(len(table2))
    print 'Number FDR <= 0.05: '+str(sum(table2.FDR<=0.05))+'\n'

    print '\n' + args.table3[0] + '\n'
    print 'Number p-value <= 0.05: '+str(len(table3))
    print 'Number FDR <= 0.05: '+str(sum(table3.FDR<=0.05))+'\n'


    set1 = sets.Set(table1.index)
    set2 = sets.Set(table2.index)
    set3 = sets.Set(table3.index)

    print 'Overlapping statistics'+'\n'
    print 'Intersection (p-value<=0.05)'
    print args.table1[0] + ' and ' + args.table2[0] + ': \n' + str(len(set1.intersection(set2)))
    print args.table1[0] + ' and ' + args.table3[0] + ': \n' + str(len(set1.intersection(set3)))
    print args.table3[0] + ' and ' + args.table2[0] + ': \n' + str(len(set3.intersection(set2)))
    print args.table1[0] + ' and ' + args.table2[0] + ' and ' + args.table3[0] + ': \n' + str(len(set1.intersection(set3.intersection(set2))))

    table1sub = table1[table1.FDR<=0.05]
    table2sub = table2[table2.FDR<=0.05]
    table3sub = table3[table3.FDR<=0.05]

    set1 = sets.Set(table1sub.index)
    set2 = sets.Set(table2sub.index)
    set3 = sets.Set(table3sub.index)

    print '\n\nIntersection (FDR<=0.05)'
    print args.table1[0] + ' and ' + args.table2[0] + ': \n' + str(len(set1.intersection(set2)))
    print args.table1[0] + ' and ' + args.table3[0] + ': \n' + str(len(set1.intersection(set3)))
    print args.table3[0] + ' and ' + args.table2[0] + ': \n' + str(len(set3.intersection(set2)))
    print args.table1[0] + ' and ' + args.table2[0] + ' and ' + args.table3[0] + ': \n' + str(len(set1.intersection(set3.intersection(set2))))
    #pdb.set_trace()
    sys.exit()
Example #3
0
def main():

    # define input/output
    options = get_options()

    # read summary stats file
    stats_file = options.stats
    stats = pd.read_table(stats_file, sep="\t")

    # read region file
    regions_file = options.regions
    regions = pd.read_table(regions_file, header=None, names=['index_snp','locus'])

    regions['chr'] = regions['locus'].str.extract('chr(.*):').astype(int)
    regions['start'] = regions['locus'].str.extract(':(.*)-').astype(int)
    regions['end'] = regions['locus'].str.extract('-(.*)').astype(int)

    # iterate through regions
    subset_list = []

    for index, row in regions.iterrows():

        subset = stats[np.logical_and(stats['CHROM'] == row['chr'],
            np.logical_and(stats['POS'] >= row['start'], stats['POS'] <= row['end']))]
        subset.is_copy = False
        subset['index_snp'] = row['index_snp']
        subset_list.append(subset)

    # concatenate list of dataframes
    stats_subset = pd.concat(subset_list)

    # write exclusion file
    out_file = options.out_file
    stats_subset.to_csv(out_file, sep=' ', index=False)
Example #4
0
def simple_expected_result():

    melano = u"""Chromosome Bin chrX/ChIP_1_melanocyte.bed.gz chrX/ChIP_2_melanocyte.bed.gz chrX/Input_1_melanocyte.bed.gz chrX/Input_2_melanocyte.bed.gz Enriched_melanocyte
chr1 200 0.0 0.0 0.0 0.0 0.0
chr1 400 0.0 0.0 0.0 0.0 0.0
chr1 600 0.0 0.0 0.0 0.0 1.0
chr1 800 0.0 2.0 0.0 0.0 1.0
chr1 1000 0.0 0.0 0.0 0.0 1.0
chr1 1200 13.0 128.0 2.0 2.0 1.0
chr1 1400 0.0 0.0 0.0 0.0 0.0
chr1 1600 0.0 0.0 0.0 0.0 0.0"""

    fibro = u"""Chromosome Bin chrX/ChIP_1_fibroblast.bed.gz chrX/ChIP_2_fibroblast.bed.gz chrX/Input_1_fibroblast.bed.gz chrX/Input_2_fibroblast.bed.gz Enriched_fibroblast
chr1 200 0.0 0.0 0.0 0.0 1
chr1 400 0.0 0.0 0.0 0.0 1
chr1 600 0.0 0.0 0.0 0.0 1
chr1 800 0.0 2.0 0.0 0.0 1
chr1 1000 0.0 0.0 0.0 0.0 1
chr1 1200 13.0 128.0 2.0 2.0 1
chr1 1400 0.0 0.0 0.0 0.0 1
chr1 1600 0.0 0.0 0.0 0.0 1"""

    od = OrderedDict()
    od["melano"] = pd.read_table(StringIO(melano), sep="\s+", index_col=[0, 1])

    od["fibro"] = pd.read_table(StringIO(fibro), sep="\s+", index_col=[0, 1])

    return od
Example #5
0
def train(sparkContext):
	Utils.logMessage("\nClassification model started")
	pd.read_table(pv.processedFile, sep=',',encoding='utf-8').to_csv(pv.processedFile, header=False, index=False,encoding='utf-8')
	truncatedAccounts = sparkContext.textFile(pv.processedFile).take(pv.truncateLineCount - 1)
	rawData = sparkContext.parallelize(truncatedAccounts).map(countByFeatures).map(lambda item: LabeledPoint(item[0], Vectors.dense(item[2:])))

	trainWithParam(sparkContext, rawData, 0.7, 'entropy', 4, 16)
Example #6
0
 def __init__(self,args):
     if args.window_type not in ['BP','SNP']:
         raise ValueError('Window type not supported')
     bed_1 = Bed(args.bfile) #
     af1 = self.get_allele_frequency(bed_1,args) #
     print(len(af1), "SNPs in file 1")
     snps_1 = (af1>args.maf)&(af1<1-args.maf) #
     print(np.sum(snps_1), "SNPs in file 1 after MAF filter")
     if (args.from_bp is not None) and (args.to_bp is not None):
         k = (bed_1.pos[:,2]>args.from_bp)&(bed_1.pos[:,2]<args.to_bp)
         snps_1 = snps_1&k
     snps_to_use = bed_1.sid[snps_1]
     if args.extract is not None:
         keep = np.array([l.strip() for l in open(args.extract,'r')])
         snps_to_use = np.intersect1d(snps_to_use,keep)
         print(len(snps_to_use),"SNPs remaining after extraction")
     bed_1_index = np.sort(bed_1.sid_to_index(snps_to_use)) #
     pos = bed_1.pos[bed_1_index] #
     bim_1=pd.read_table(bed_1.filename+'.bim',header=None,
                         names=['chm','id','pos_mb','pos_bp','a1','a2'])
     af = af1[bed_1_index] #
     if args.afile is not None:
         a1 =  pd.read_table(args.afile,header=None,sep='\s*',
                             names=['id1','id2','theta'])
     else:
         a1 = None
     self.af = af
     self.M = len(bed_1_index) #
     self.windows = self.get_windows(pos,args) #
     self.chr = pos[:,0]
     self.pos = pos[:,2]
     self.id = bed_1.sid[bed_1_index]
     self.A1 = bim_1['a1'].loc[bed_1_index]
     self.A2 = bim_1['a2'].loc[bed_1_index]
     self.scores = self.compute(bed_1,bed_1_index,af,a1,args) #
Example #7
0
def runPyCombat(fl):
    """ This method was added specifically for AltAnalyze version 2.0.8 (not in the original GitHub code) """
    print "Running Combat...",
    expr_input_dir = fl.ExpFile()
    pheno_dir = formatPhenoFile(fl)

    moved_exp_dir = export.findParentDir(expr_input_dir) + "Non-Combat/" + export.findFilename(expr_input_dir)
    try:
        export.copyFile(expr_input_dir, moved_exp_dir)
        print "Moved original expression file to:"
        print "\t" + moved_exp_dir
        ### now overwrite the original file, excluding the commented rows
        export.cleanFile(expr_input_dir, removeExtra="#")  ### remove comments from the original file
    except Exception:
        pass

    pheno = pa.read_table(pheno_dir, index_col=0)
    dat = pa.read_table(expr_input_dir, index_col=0)

    mod = patsy.dmatrix("group", pheno, return_type="dataframe")
    t = time.time()
    # print dat, pheno.batch, mod;sys.exit()
    ebat = combat(dat, pheno.batch, mod, 0)
    print "...Combat completed in %.2f seconds" % (time.time() - t)

    print "Original expression file over-written with batch effect removal results..."
    ebat.to_csv(expr_input_dir, sep="\t")
Example #8
0
def runSharesPSRCToBKRZones():
    #list of two lists
    files_shares = [files_manu_shares, file_wtcu_shares]
    header_rows = 3 #number of rows at the beginning of a file with header information

    headers = {} #dictionary to save header information
    for files_group in files_shares:
        for file in files_group:
            print("working on file: " + file)
            file_path = os.path.join(wd, file)

            #read header - use "#" as separator as it is less likely to be present in the file
            headers[file] = pd.read_table(file_path, delimiter = "#", header = None, nrows = header_rows) 
        
            # skip first few rows, as they contain general information - also ignore rows starting with 'c' (comment lines)
            shares_psrc = pd.read_table(file_path, delimiter = " ", names = ["o","d",file], comment = "c", skiprows = header_rows)

            if file == files_group[0]:
                #if first file in the group, set to the file shares
                truck_shares_psrc = shares_psrc
            else:
                #add a new column for a new file
                truck_shares_psrc = pd.merge(truck_shares_psrc, shares_psrc, on = ["o","d"])

        # merge psrc to bkr correspondence with percent
        tazGroups = pd.merge(truck_shares_psrc, tazShares, left_on = "o", right_on = "psrc_zone_id")
        tazGroups[file] = tazGroups[file] * tazGroups["percent"]

        # group by unique pair of bkr zone and group
        tazGroups_grouped = tazGroups.groupby(["bkr_zone_id"])

        # calculate sum of percent by unique pair
        tazGroups_sum = tazGroups_grouped[files_group].sum()
        tazGroups_sum['sum'] = tazGroups_sum[files_group].sum(axis=1)

        for file in files_group:
            tazGroups_sum[file] *= 1/tazGroups_sum['sum'] 

        tazGroups_sum['sum'] = tazGroups_sum[files_group].sum(axis=1)
        tazGroups_sum =  tazGroups_sum.round(4) #round values to 4 decimal

        #temp = tazGroups_sum.ix[tazGroups_sum["sum"]>1.0] #debug: to find out rows that have sum value more than 1

        tazGroups_sum = tazGroups_sum[files_group].reset_index() # makes object a data frame by setting the current index to a column
        tazGroups_sum["c"] = "all:"

        for file in files_group:
            tazGroups_bkr = tazGroups_sum[["bkr_zone_id", "c", file]]
            tazGroups_bkr = tazGroups_bkr.sort_values(by = ['bkr_zone_id'], ascending=[True])

            # write - first header and then append the updated data
            outfile = file.split(".")[0]
            outfile = os.path.join(wd, outfile + "_bkr.in")

            #first write header
            headers[file].to_csv(outfile, sep = " ", header = False, index = False, quoting=csv.QUOTE_NONE, escapechar = " ") #had to add space as escapechar otherwise throws an error - not sure if that would cause any issue in the model

            #write data
            with open(outfile, 'a') as wfile:
                tazGroups_bkr.to_csv(wfile, sep = " " , header = False, index = False)
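
The normalization loop above rescales each group's share columns so that, within every bkr_zone_id, the shares sum to 1. A minimal toy sketch of that groupby/normalize pattern (the frame and column names here are invented for illustration, not taken from the script):

import pandas as pd

# toy shares keyed by zone; 'manu' and 'wtcu' stand in for the real share columns
shares = pd.DataFrame({"bkr_zone_id": [1, 1, 2],
                       "manu": [0.2, 0.4, 0.5],
                       "wtcu": [0.1, 0.3, 0.5]})
by_zone = shares.groupby("bkr_zone_id")[["manu", "wtcu"]].sum()
by_zone["sum"] = by_zone[["manu", "wtcu"]].sum(axis=1)
for col in ["manu", "wtcu"]:
    by_zone[col] /= by_zone["sum"]      # each zone's manu + wtcu now sums to 1
print(by_zone.round(4))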
Example #9
0
def biscorr(maxitems=10,fig=None,ax=None):
    ''
    from string import lstrip
    
    biscorr49 = pandas.read_table('/home/ewout/Dropbox/RIRT/dsc49-biscorr.csv',header=None,names=['Q','biscorr'])
    biscorr89 = pandas.read_table('/home/ewout/Dropbox/RIRT/dsc89-biscorr.csv',header=None,names=['Q','biscorr'])

    stripV = lambda s: lstrip(s,'V')
    biscorr49['Q'] = biscorr49['Q'].apply(stripV)
    biscorr89['Q'] = biscorr89['Q'].apply(stripV)
    biscorr49 = biscorr49.sort(columns='biscorr')
    biscorr89 = biscorr89.sort(columns='biscorr')
    biscorr49 = biscorr49[0:maxitems]
    biscorr89 = biscorr89[0:maxitems]
    
    if not fig:
        fig = plt.figure()
    if not ax:
        ax = fig.add_subplot(111)
        ax.set_title(u"Correlação biserial")

    fig,ax = orderedfig(biscorr89['Q'],biscorr89['biscorr'],biscorr49['Q'],biscorr49['biscorr'],maxitems,fig,ax)

    ax.set_xlabel(u"")
    ax.set_ylim(0,0.2)

    ax.set_xticks([])
    ax.text(0.5,-0.1,u"Item",clip_on=False,transform = ax.transAxes,ha='center')


    return fig,ax
Example #10
0
def Cleaning():
    nanoClean = pd.read_table("nanoflex clean.txt")
    nanoClean.columns = ['V', 'I']

    macroClean = pd.read_table("clean.txt")
    macroClean.columns = ['V', 'I']
    return nanoClean, macroClean
Example #11
0
def ChronoAmp():
    CA_Nano = pd.read_table("chronoampnano.txt")
    CA_Nano.columns = ['t', 'I']

    CA_Macro = pd.read_table("chronoamp macro.txt")
    CA_Macro.columns = ['t', 'I']
    return CA_Nano, CA_Macro
Example #12
0
def main(args):
    logging.info("Reading sample info")
    sample_info = pd.read_table(args.sample_info, header=None, index_col=0, names=['avg_read_len'])
    logging.info("Reading gene lengths")
    gene_lengths = pd.read_table(args.gene_lengths, header=None, index_col=0, names=['gene_id','gene_length'])

    df = pd.DataFrame()

    for fn, sample_name in zip(args.coverage_files, args.sample_names):
        logging.info("Calculating TPM for "+ sample_name)
        ## Read counts per gene for sample
        rg = pd.read_table(fn, index_col=0, header=None, names=['gene_id', 'count'])
        ## Intersect with genes in the gene length file
        rg = rg.loc[list(set(gene_lengths.index).intersection(set(rg.index)))]
        gene_lengths = gene_lengths.loc[list(rg.index)]
        ## Average read length for sample
        rl = sample_info.ix[sample_name,'avg_read_len']
        ## Calculate T for sample
        T = rl * rg['count'].divide(gene_lengths['gene_length']).sum()
        ## Calculate TPM for sample
        tpm = ((1e6*rl)/float(T))*(rg['count'].divide(gene_lengths['gene_length']))
        ## Create dataframe
        TPM = pd.DataFrame(tpm,columns=[sample_name])
        ## Concatenate to results
        df = pd.concat([df,TPM],axis=1)
    ## Write to file
    df.to_csv(sys.stdout, sep='\t')
    logging.info("Done")
Example #13
0
def Comparison():
    nanoComp = pd.read_table("nanoflex_comparison.txt")
    nanoComp.columns = ['V', 'I']

    macroComp = pd.read_table("macro_comparison.txt")
    macroComp.columns = ['V', 'I']
    return nanoComp, macroComp
Example #14
0
def main(args):
    # Import data
    logger.info("Importing Data")
    dat = pd.read_table(args.fname, comment="#")
    dat.set_index(args.uniqID, inplace=True)

    # Prepare Figure
    ## Title
    if args.title:
        title = args.title
    else:
        title = "{0} vs {1} vs {2}".format(args.x, args.y, args.z)

    fig = plt.figure(figsize=(8, 5))
    ax = fig.add_subplot(111, projection="3d")
    fig.suptitle(title)
    if args.xlab:
        xlab = args.xlab
    else:
        xlab = args.x

    if args.ylab:
        ylab = args.ylab
    else:
        ylab = args.y

    if args.zlab:
        zlab = args.zlab
    else:
        zlab = args.z

    # Make plots
    if args.dname and args.group:
        # If group information give color by group.
        design = pd.read_table(args.dname)
        design.set_index("sampleID", inplace=True)
        merged = dat.join(design, how="left")
        grp = merged.groupby(args.group)
        cmap = getColors(grp.indices.keys())

        for i, val in grp:
            c = cmap[i]
            xs = val[args.x]
            ys = val[args.y]
            zs = val[args.z]
            ax.scatter(xs, ys, zs, c=c, s=100, label=i)
        buildLegend(ax, cmap)

    else:
        # Else just plot.
        xs = dat[args.x]
        ys = dat[args.y]
        zs = dat[args.z]
        ax.scatter(xs, ys, zs, s=100)

    ax.set_xlabel(xlab)
    ax.set_ylabel(ylab)
    ax.set_zlabel(zlab)

    galaxySavefig(fig, args.fig)
Example #15
0
def clean_import_scsnv():
    """read each chr 1-22 and X/Y from the dbscSNV download into a dict of dataframes 
    for further processing"""
    
    chrom_dict = {}
    cols = [0,1,2,3,16,17]
    col_names = ['CHROM', 'POS', 'REF', 'ALT', 'ada_score', 'rf_score']

    for i in range(23):
        if i > 0:
            chrom_dict[str(i)]  = pd.read_table('dbscSNV1.1.chr'+str(i), sep = '\t', 
                na_values = '.', usecols=cols, names=col_names, header=0)
                
    chrom_dict.setdefault('X', pd.read_table('dbscSNV1.1.chrX', sep = '\t', 
                na_values = '.', usecols=cols, names=col_names, header=0))
                
    chrom_dict.setdefault('Y', pd.read_table('dbscSNV1.1.chrY', sep = '\t', 
                na_values = '.', usecols=cols, names=col_names, header=0))
                
    #for i in range(23):
    #    if i > 0:
    #        chrom_dict[str(i)]  = pd.read_table('dbscSNV1.1.chr'+str(i), sep = '\t', 
    #            na_values = '.', usecols=cols, names=col_names, header=0, nrows=10000)
    #            
    #chrom_dict.setdefault('X', pd.read_table('dbscSNV1.1.chrX', sep = '\t', 
    #            na_values = '.', usecols=cols, names=col_names, header=0, nrows=10000))
    #            
    #chrom_dict.setdefault('Y', pd.read_table('dbscSNV1.1.chrY', sep = '\t', 
    #            na_values = '.', usecols=cols, names=col_names, header=0, nrows=10000))
    #
    return chrom_dict  
Example #16
0
def _parse_table(f, hdr, dao_type):
    if dao_type is not None and dao_type.read_cols is not None:  # limit number of read cols
        df = pd.read_table(f, header=None, sep='\s+', usecols=range(dao_type.read_cols))
    else:
        df = pd.read_table(f, header=None, sep='\s+')

    #df.insert(0, 'id', df.index.to_series())
    if dao_type is None:
        dao_type = _guess_filetype(hdr, df)
    if dao_type == DAO.AP_FILE:  # two row per star format correction
        odd = df.iloc[0::2]
        odd.columns = DAO.AP_FILE_ODD.columns[:odd.columns.size]
        even = df.iloc[1::2]
        even.columns = DAO.AP_FILE_EVEN.columns[:even.columns.size]
        even.index = odd.index
        df = odd.join(even, rsuffix='foo')
    else:
        df.columns = dao_type.columns[:df.columns.size]


    df.id = df.id.astype(int)
    df.index = df.id

    # find NaN
    for col in df.columns:
        coltype = _get_col_type(dao_type.extension, col)
        if coltype.NaN:
            df[col].replace(coltype.NaN, pd.np.nan, inplace=True)

    ret = StarList(df)
    ret.DAO_type = dao_type
    return ret
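
The AP_FILE branch above reassembles a two-lines-per-star layout by pairing each odd row with the even row that follows it. A toy illustration of that iloc/join pattern (the data and column names are made up):

import pandas as pd

raw = pd.DataFrame({0: [1, 10.0, 2, 20.0], 1: [5.0, 0.1, 6.0, 0.2]})
odd = raw.iloc[0::2].reset_index(drop=True)    # first line of each record
even = raw.iloc[1::2].reset_index(drop=True)   # second line of each record
odd.columns = ["id", "x"]
even.columns = ["sky", "err"]
print(odd.join(even))                          # one row per star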
Example #17
0
def get_mutation_data(gene_list, cancer_subtypes):
    """retrieve case-level data on mutations for given list 
    of genes and cell lines
    """
    
    base_url = 'http://www.cbioportal.org/webservice.do'

    genes = ' '.join(gene_list)
    subtypes = ' '.join(['%s_tcga_mutations' % c.lower()
                         for c in cancer_subtypes])
    parameters = {'cmd': 'getMutationData',
                  'gene_list': genes,
                  'genetic_profile_id': subtypes}

    r = requests.get(base_url, params=parameters)

    urlData = r.content.decode('utf-8')
    error_message = 'Error: Problem when identifying '\
                    'a cancer study for the request.\n'
    if urlData == error_message:
        df = pd.read_table(io.StringIO(urlData))
    else:
        df = pd.read_table(io.StringIO(urlData), header=1)
        df = df[['gene_symbol', 'case_id',
                 'mutation_type', 'genetic_profile_id']]
        df = df[~((df.gene_symbol == 'Mutations') | (
            df.gene_symbol == 'gene_symbol'))]
        df = df.dropna()
    return df
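
A minimal call sketch for the function above; the gene symbols and study abbreviations are placeholders, and any identifiers accepted by the cBioPortal web service would do:

muts = get_mutation_data(['TP53', 'BRCA1'], ['BRCA', 'LUAD'])
print(muts.head())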
Example #18
0
def get_hist_and_rec(station_number):
    #create "artificial" wildcard path for historical data. For every station imaginable. 
    histpath_temp = '/home/pythonproject/Weather/ftp-cdc.dwd.de/pub/CDC/observations_germany/climate/daily/kl/historical/produkt_klima_Tageswerte_*'
    histpath_temp += str(station_number).zfill(5)+'.txt'
    #create "artificial" wildcard path for recent data. For the station we're looking at right now.       
    recpath_temp = '/home/pythonproject/Weather/ftp-cdc.dwd.de/pub/CDC/observations_germany/climate/daily/kl/recent/produkt_klima_Tageswerte_*'
    recpath_temp += str(station_number).zfill(5)+'.txt'   

    #check if the historical path actually exists (glob.glob returns the matching files).
    if len(glob.glob(histpath_temp)) != 0:
        #if file exists, save the path as a string to "histpath" variable.
        histpath = glob.glob(histpath_temp)[0]
        hist_ = pd.read_table(histpath, sep=";", low_memory=False)
        #is_hist = True

    else:
        #is_hist=False
        hist_ = []

        #check if recent data exists
    if len(glob.glob(recpath_temp)) != 0:
        recpath = glob.glob(recpath_temp)[0]
        rec_ = pd.read_table(recpath, sep=";", low_memory=False)
        #is_rec = True

    else:
        #is_rec = False
        rec_ = []
    return (hist_,rec_)
Example #19
0
def main():

    parser = argparse.ArgumentParser(description="Extract fasta file.")
    parser.add_argument('-DE',nargs=1,type=str,help="Table containing DE results.")
    parser.add_argument('-trinity',nargs=1,type=str,help="Trinity results.")
    parser.add_argument('-out',nargs=1,type=str,help="Out file.")
    args = parser.parse_args()


    #load tables

    DEResults = pandas.read_table(args.DE[0])
    trinityResults = pandas.read_table(args.trinity[0])


    #parse data

    temp = list(trinityResults['trans_derived'])
    temp = [x.split(':')[0] for x in temp]
    trinityResults.index = temp


    filtered_trinityResults = trinityResults.ix[DEResults.index]
    topBlastHit = list(filtered_trinityResults['TopBlastHit'])

    uniprotID=[]

    for i in topBlastHit:
        if i != '.':
            uniprotID.append(i.split('|')[1])

    uniprotID = pandas.DataFrame(uniprotID)
    uniprotID.to_csv(args.out[0],sep='\n',header=False,index=False)

    sys.exit(0)
Example #20
0
def test_pairwise():
    train_pool = Pool(ZEN_TRAIN_FILE, column_description=ZEN_CD_FILE, pairs=ZEN_TRAIN_PAIRS_FILE)
    test_pool = Pool(ZEN_TEST_FILE, column_description=ZEN_CD_FILE, pairs=ZEN_TEST_PAIRS_FILE)
    model = CatBoost(params={'loss_function': 'PairLogit', 'random_seed': 0, 'iterations': 2, 'thread_count': 8})
    model.fit(train_pool)
    pred1 = model.predict(test_pool)

    df = read_table(ZEN_TRAIN_FILE, delimiter='\t', header=None, dtype={12: str})
    train_target = df.loc[:, 1]
    cat_features = range(13)
    train_data = df.drop([0, 1, 15], axis=1).astype(str)
    train_pairs = read_table(ZEN_TRAIN_PAIRS_FILE, delimiter='\t', header=None)

    df = read_table(ZEN_TEST_FILE, delimiter='\t', header=None, dtype={12: str})
    test_data = df.drop([0, 1, 15], axis=1).astype(str)

    model.fit(train_data, train_target, cat_features, pairs=train_pairs)
    pred2 = model.predict(test_data)

    pairs_weight = np.ones(train_pairs.shape[0])
    model.fit(train_data, train_target, cat_features, pairs=train_pairs, pairs_weight=pairs_weight)
    pred3 = model.predict(test_data)

    assert _check_data(pred1, pred2)
    assert _check_data(pred1, pred3)
Example #21
0
def read_clinical_data(path, cancer):
    cancer = cancer.lower()
    na_vals = ['[Completed]', '[Not Available]', '[Not Applicable]', 'null']
    pat = pd.read_table(path + 'clinical_patient_{}.txt'.format(cancer),
                        index_col=0, skiprows=[0, 2], na_values=na_vals)
    f = pat.dropna(axis=1, how='all')
    for fu in os.listdir(path):
        if 'clinical_follow_up' not in fu:
            continue
        followup = pd.read_table(path + fu, index_col=0, skiprows=[0, 2],
                                 na_values=na_vals)
        f = pd.concat([f, followup])
    f.columns = f.columns.map(lambda s: s.replace('_', '').lower())
    
    time_vars = ['daystolastfollowup', 'daystolastknownalive',
                 'daystodeath']
    time_cols = list(f.columns.intersection(time_vars))
    
    # f['vitalstatus'] = f['vitalstatus'].map(lambda s: s in 
    #                                        ['DECEASED','Dead','deceased'], 
    #
    #                                   na_action='skip')
    f['vitalstatus'] = f['daystodeath'].isnull()
    
    f = f.sort(columns=['vitalstatus'] + time_cols, ascending=True)
    f = f.groupby(lambda s: s[:12], axis=0).last()
    return f
Example #22
0
def main(args):
    '''Everything is defined here'''
    outdir = args.outdir
    run = os.path.abspath(outdir).split('/')[-1].split('virmet_output_')[1]
    try:
        os.chdir(outdir)
    except FileNotFoundError:
        sys.exit('Where is the output dir? Check the path.')

    sample_dirs = glob.glob('*_S*')
    all_reads = pd.DataFrame()
    all_orgs = pd.DataFrame()
    for sd in sample_dirs:
        # parse and save stat files
        stat_file = os.path.join(sd, 'stats.tsv')
        df = pd.read_table(stat_file, sep='\t', header=None,
                           names=['category', 'reads'])
        df['sample'] = sd
        df['run'] = run
        all_reads = all_reads.append(df)
        # parse and save orgs_list files
        orgs_file = os.path.join(sd, 'orgs_list.tsv')
        df = pd.read_table(orgs_file, sep='\t', header=0)
        df['sample'] = sd
        df['run'] = run
        all_orgs = all_orgs.append(df)

    all_orgs.to_csv('orgs_species_found.tsv', sep='\t', index=False)
    all_reads.to_csv('run_reads_summary.tsv', sep='\t', index=False)
Example #23
0
def get_tickers ():
    global _tickers
    if _tickers is not None:
        return _tickers

    url_NSDQ = "http://www.nasdaq.com/screening/companies-by-name.aspx?letter=0&exchange=nasdaq&render=download"
    url_NYSE = "http://www.nasdaq.com/screening/companies-by-name.aspx?letter=0&exchange=nyse&render=download"
    nsdq = pd.read_table(url_NSDQ,sep=",")
    nyse = pd.read_table(url_NYSE,sep=",")
    tickers = pd.concat([nsdq,nyse])

    def dollar_to_int (dollar_string):
        try:
            parsed = int(float(dollar_string[1:-1])*1000)
            if dollar_string[-1] == 'B':
                parsed *= 1000
            return parsed
        except:
            return np.NaN
    tickers = tickers.drop_duplicates("Name")
    tickers = tickers[["Symbol","MarketCap","Sector","industry"]]
    tickers.MarketCap = tickers.MarketCap.apply(dollar_to_int)
    tickers = tickers[np.isfinite(tickers.MarketCap)]

    _tickers = tickers.reset_index()[["Symbol","MarketCap","Sector","industry"]]
    return _tickers
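
Note that dollar_to_int, as written, reports market cap in thousands of dollars: the leading '$' and trailing suffix are stripped, the number is scaled by 1000, and a trailing 'B' adds another factor of 1000, with unparseable strings falling back to NaN. A quick sketch of what it yields:

dollar_to_int("$1.5B")   # -> 1500000  (1.5 billion dollars, in thousands)
dollar_to_int("$750M")   # -> 750000   (750 million dollars, in thousands)
dollar_to_int("n/a")     # -> nan      (unparseable input)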
Example #24
0
def main(args):
    clustering = pd.read_table(args.clustering_file, sep=',', names=['contig_id', 'cluster_id'], index_col=0)
    taxonomy_df = pd.read_table(args.taxonomy_file, header=None, index_col=0, names=["contig_id", "taxonomy", "bla", "bla1", "bla2"])
    all_approved = pd.read_table(args.all_approved_file, header=None, names=["contig_id"], index_col=0)
    checkm_taxonomy = pd.read_table(args.checkm_taxonomy_file, index_col=0)

    all_approved_set = set(all_approved.index.values)
    unapproved_rrna = defaultdict(int)
    approved_rrna = {}
    levels = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
    for rrna_contig in taxonomy_df.index.values:
        if rrna_contig in clustering.index:
            cluster_id = clustering.loc[rrna_contig]['cluster_id']
            if cluster_id in all_approved_set:
                checkm_val = checkm_taxonomy.loc[cluster_id]['Taxonomy'].split(';')
                metaxa_val = taxonomy_df.loc[rrna_contig]['taxonomy'].split(';')

                metaxa_val = fix_strange_metaxa_vals(metaxa_val)


                matched_level = None
                for i, level in enumerate(levels):
                    checkm_level_val, metaxa_level_val = None, None
                    if len(checkm_val) > i and len(metaxa_val) > i:
                        checkm_level_val = checkm_val[i][3:]
                        metaxa_level_val = metaxa_val[i]

                        if level == 'species':
                            metaxa_level_val = metaxa_val[i].replace(' ', '_')
                        if checkm_level_val == metaxa_level_val:
                            matched_level = i
                        else:
                            break
                    else:
                        matched_level = i-1
                        break
                if cluster_id not in approved_rrna:
                    approved_rrna[cluster_id] = {'matching': 0, 'not matching': 0}

                if matched_level >= 3:
                    approved_rrna[cluster_id]['matching'] += 1
                else:
                    approved_rrna[cluster_id]['not matching'] += 1
                #print(most_detailed_level_checkm, most_detailed_level_metaxa)
                #print(most_detailed_matched_level)
                #print(taxonomy_df.loc[rrna_contig]['taxonomy'], checkm_taxonomy.loc[cluster_id]['Taxonomy'])
            else:
                unapproved_rrna[cluster_id] += 1

    for cluster_id in all_approved_set:
        if cluster_id not in approved_rrna:
            approved_rrna[cluster_id] = {'matching': 0, 'not matching': 0}

    approved_stats_df = pd.DataFrame.from_dict(approved_rrna, orient='index')

    unapproved_stats_df = pd.DataFrame.from_dict(unapproved_rrna, orient='index')
    unapproved_stats_df.columns = ['nr_rrna']

    print(approved_stats_df)
    print(unapproved_stats_df)
Example #25
0
def threat_collect():
    # Includes Malc0de, emerging threats and Zeus Tracker as examples
    url_malc0de = 'http://malc0de.com/bl/IP_Blacklist.txt'
    url_et = 'http://rules.emergingthreats.net/blockrules/compromised-ips.txt'
    url_zeus = 'https://zeustracker.abuse.ch/blocklist.php?download=ipblocklist'
    url_zeus_domains = 'https://zeustracker.abuse.ch/blocklist.php?download=domainblocklist'

    # Convert to DataFrames
    df_malc0de = pd.read_table(url_malc0de, index_col=None, skiprows=4, header=None, names=['actor'])
    df_et = pd.read_table(url_et, index_col=None, skiprows=0, header=None, names=['actor'])
    df_zeus = pd.read_table(url_zeus, index_col=None, skiprows=6, header=None, names=['actor'])
    df_zeus_domains = pd.read_table(url_zeus_domains, index_col=None, skiprows=6, header=None, names=['actor'])

    # Alternatively, put a bunch of threat intel CSVs in the "intel" directory
    #
    # Read all threat intel from intel folder
    # intel_path ='intel'
    # all = glob.glob(intel_path + "/*.csv")
    # ti_combine = pd.DataFrame()
    # ti_list_ = []
    # for file_ in all:
    #     new_frame = pd.read_csv(file_,index_col=None, header=0, names=['actor'])
    #     ti_list_.append(new_frame)
    # ti_combine = pd.concat(ti_list_)

    # Combine dataframes
    ti_combine = pd.concat([df_malc0de, df_et, df_zeus, df_zeus_domains], axis=0)

    return ti_combine
Example #26
0
File: opus.py Project: smeylan/opus
def augmentOPUSfile(inputfile, mergefile, outputfile):
    '''augment an OPUS file with additional annotations, e.g. adding a column with segmented Sampa from Lexique to the French data,
    or segmented '''
    iff = pandas.read_table(inputfile, encoding='utf-8').dropna()
    mff = pandas.read_table(mergefile, encoding='utf-8').dropna()
    iff_m = iff.merge(mff, left_on="word", right_on="word")
    iff_m.to_csv(outputfile, sep='\t', index=False, encoding='utf-8') #!!! keep with the same format
Example #27
0
    def mergeSingleExpressionTables(infile, outfile):
        '''
        Merge refcoding and lncRNA count tables from a single condition
        if there are separate input reference gtfs.
        '''

        file1 = infile[0]
        file2 = infile[1]

        tmpfile = P.getTempFilename(shared=True)

        df1 = pd.read_table(file1,
                            sep="\t",
                            index_col=0,
                            header=0,
                            compression="gzip")

        df2 = pd.read_table(file2,
                            sep="\t",
                            index_col=0,
                            header=0,
                            compression="gzip")

        out_frame = df1.append(df2)

        out_frame.to_csv(tmpfile, sep="\t")

        statement = '''cat %(tmpfile)s | gzip > %(outfile)s; rm -rf %(tmpfile)s'''

        P.run()
Example #28
0
def count_ddd_trios(families_path, trios_path, diagnosed_path):
    """ count the male and female probands in the complete DDD trios
    
    Args:
        families_path: path to DDD family relationships file, in ped format,
            containing proband IDs and sex information
        trios_path: path to table of probands in complete trios.
        diagnosed_path: path to table of probands with diagnoses
    
    Returns:
        tuple of male and female proband counts.
    """
    
    # load proband information, then select the probands who have exome sequence
    # available for both parents.
    families = pandas.read_table(families_path, sep="\t")
    trios = pandas.read_table(trios_path, sep="\t")
    proband_ids = trios["proband_stable_id"]
    probands = families[families["individual_id"].isin(proband_ids)]
    
    # get the number of trios studied in our data for each sex
    sex = probands["sex"].value_counts()
    male = sex[["M"]]
    female = sex[["F"]]
    
    if diagnosed_path is not None:
        # remove probands in DDD, unless we are not using the DDD probands.
        diagnosed = pandas.read_table(diagnosed_path, sep="\t")
        diagnosed = diagnosed[~diagnosed[["person_id", "sex"]].duplicated()]
        
        male -= sum(diagnosed["sex"].isin(["Male", "male", "M", "m"]))
        female -= sum(diagnosed["sex"].isin(["Female", "female", "F", "f"]))
    
    return (male, female)
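
A minimal call sketch for the function above (the paths are placeholders, not real DDD files):

males, females = count_ddd_trios(
    families_path='family_relationships.ped',  # placeholder path
    trios_path='complete_trios.txt',           # placeholder path
    diagnosed_path=None)                       # skip subtracting diagnosed probands
print(males, females)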
Example #29
0
def merge_for_appended(app_rep_path):
    #Kind of hack out the STEM and OUTPUT_DIR
    STEM = os.path.basename(app_rep_path).split('_naive_report_Appended')[0]
    OUTPUT_DIR = app_rep_path.split('/reports/')[0]

    naive_path = os.path.join(OUTPUT_DIR,'reports',STEM+'_naive_report.txt')
    glm_path = os.path.join(OUTPUT_DIR,'reports','glmReports',STEM+'_FUSION_W_ANOM_AND_INDEL_JUNCPOUT')

    appended_dir = os.path.join(OUTPUT_DIR,'reports','AppendedReports')
    if not os.path.exists(appended_dir):
        os.mkdir(appended_dir)

    appended_path = os.path.join(appended_dir,STEM+'_naive_report_Appended.txt')
    naive = pd.read_table(naive_path,sep='\t')
    glm = pd.read_table(glm_path,sep='\t')
    appended = pd.read_table(appended_path,sep='\t')

    #Rename the first naive column to match the first glm column
    naive.rename(columns={'@Junction':'junction'}, inplace=True)

    #Merge the two on their only shared column
    merged = pd.merge(naive,glm)

    #os.rename(app_rep_path,app_rep_path+'.old')
    out_path = STEM+'.txt.appended'
    merged.to_csv(out_path,sep='\t',index=False)
    return out_path
Example #30
0
def clean_import_scsnv():
    """read each chr 1-22 and X/Y from the dbscSNV download into a dict of dataframes 
    for further processing"""

    chrom_dict = {}
    cols = [0, 1, 2, 3, 8, 16, 17]
    col_names = ["chr", "hg19_pos", "ref", "alt", "RefSeq_region", "ada_score", "rf_score"]

    for i in range(23):
        if i > 0:
            chrom_dict[str(i)] = pd.read_table(
                "dbscSNV1.1.chr" + str(i), sep="\t", na_values=".", usecols=cols, names=col_names, header=0
            )

    chrom_dict.setdefault(
        "X", pd.read_table("dbscSNV1.1.chrX", sep="\t", na_values=".", usecols=cols, names=col_names, header=0)
    )

    chrom_dict.setdefault(
        "Y", pd.read_table("dbscSNV1.1.chrY", sep="\t", na_values=".", usecols=cols, names=col_names, header=0)
    )

    # for i in range(23):
    #    if i > 0:
    #        chrom_dict[str(i)]  = pd.read_table('dbscSNV1.1.chr'+str(i), sep = '\t',
    #            na_values = '.', usecols=cols, names=col_names, header=0, nrows=10000)
    #
    # chrom_dict.setdefault('X', pd.read_table('dbscSNV1.1.chrX', sep = '\t',
    #            na_values = '.', usecols=cols, names=col_names, header=0, nrows=10000))
    #
    # chrom_dict.setdefault('Y', pd.read_table('dbscSNV1.1.chrY', sep = '\t',
    #            na_values = '.', usecols=cols, names=col_names, header=0, nrows=10000))

    return chrom_dict
Example #31
0
import pandas as pd
import numpy as np

y_combined = pd.read_table('CancerTypes_y.txt', sep='\t', header=None)
x_combined = pd.read_csv('Combined_processed.csv', header=0)

from sklearn.model_selection import train_test_split, GridSearchCV
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
import matplotlib.pyplot as plt

#Split data into training and test sets
x_combined_train, x_combined_test, y_combined_train, y_combined_test = train_test_split(
    x_combined, y_combined.values.flatten(), test_size=0.25, random_state=0)

model = Sequential()
model.add(Dense(512, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(x_combined_train.values, y_combined_train, epochs=20, batch_size=128)

print(model.evaluate(x_combined_test, y_combined_test))

print(model.metrics_names)
Example #32
0
def rating_stantistics(inputpath, outputpath):
    cm_data_raw = pd.read_table(inputpath, sep=',', encoding='utf-8')
    # compute summary statistics over the user action data

    newCust = pd.DataFrame(columns=[
        "userid", 'totalrate', 'totalnumber', 'averate', 'numberof1',
        'numberof2', 'numberof3', 'numberof367', 'numberof433', 'numberof4',
        'numberof5', 'lowrate', 'highrate'
    ])
    idlist = []

    for i in range(0, len(cm_data_raw)):
        record = cm_data_raw.iloc[i]
        uid = record['userid']

        if uid not in idlist:
            idlist.append(uid)
            udata = cm_data_raw[cm_data_raw['userid'] == uid]
            udata = udata.sort_values(["orderid"], ascending=False)

            # accumulate the total score
            totalrate = 0
            totalnumber = 0
            averate = 0

            numberof5 = 0
            numberof1 = 0
            numberof2 = 0
            numberof3 = 0
            numberof367 = 0
            numberof433 = 0
            numberof4 = 0
            # ratings of 3 and below count as low; tally them
            lowrate = 0
            # ratings of 4 and above count as high
            highrate = 0

            # aggregate this user's rating records
            for j in range(0, len(udata)):

                oudata = udata.iloc[j]
                totalrate = totalrate + oudata['rating']
                totalnumber = totalnumber + 1

                if j == 0:
                    nr = oudata['rating']

                # tally the user's ratings by value
                if oudata['rating'] == 1:
                    numberof1 = numberof1 + 1
                    lowrate = lowrate + 1

                if oudata['rating'] == 2:
                    numberof2 = numberof2 + 1
                    lowrate = lowrate + 1
                if oudata['rating'] == 3:
                    numberof3 = numberof3 + 1
                    lowrate = lowrate + 1
                if oudata['rating'] == 3.67:
                    numberof367 = numberof367 + 1
                if oudata['rating'] == 4.33:
                    numberof433 = numberof433 + 1

                if oudata['rating'] == 4:
                    numberof4 = numberof4 + 1

                if oudata['rating'] == 5:
                    numberof5 = numberof5 + 1

            averate = totalrate / totalnumber
            lowrate = numberof1 + numberof2 + numberof3
            highrate = numberof4 + numberof5 + numberof433

            finalud = {
                "userid": uid,
                'totalrate': totalrate,
                'totalnumber': totalnumber,
                'averate': averate,
                'numberof1': numberof1,
                'numberof2': numberof2,
                'numberof3': numberof3,
                'numberof367': numberof367,
                'numberof433': numberof433,
                'numberof4': numberof4,
                'numberof5': numberof5,
                'lowrate': lowrate,
                'highrate': highrate
            }

            newCust = newCust.append(finalud, ignore_index=True)

    newCust.to_csv(outputpath)

    return
Example #33
0
print(detail['dishes_name'].describe())



###############################################################################
#######################        Task implementation         #######################
###############################################################################

# Code 4-38
from sqlalchemy import create_engine
import pandas as pd
engine = create_engine('mysql+pymysql://root:[email protected]:\
3306/testdb?charset=utf8')
detail = pd.read_sql_table('meal_order_detail1',
      con = engine)
order = pd.read_table('../data/meal_order_info.csv',
      sep = ',',encoding = 'gbk')
user = pd.read_excel('../data/users.xlsx')
print('Dimensions of the order detail table:', detail.ndim)
print('Dimensions of the order info table:', order.ndim)
print('Dimensions of the customer info table:', user.ndim)

print('Shape of the order detail table:', detail.shape)
print('Shape of the order info table:', order.shape)
print('Shape of the customer info table:', user.shape)

print('Number of elements in the order detail table:', detail.size)
print('Number of elements in the order info table:', order.size)
print('Number of elements in the customer info table:', user.size)


# Code 4-39
Example #34
0
def read_table(*args, **kwargs):
    return pd.read_table(*args, **kwargs)
Example #35
0
# %%
# ** MODIFY **
# Set the file name and path to where you have stored the data
filename = 'streamflow_week4.txt'
filepath = os.path.join('data', filename)
print(os.getcwd())
print(filepath)

# %%
# DON'T change this part -- this creates the lists you 
# should use for the rest of the assignment
# no need to worry about how this is being done now; we will cover
# this in later sections.
#Read the data into a pandas dataframe
data=pd.read_table(filepath, sep = '\t', skiprows=30,
        names=['agency_cd', 'site_no', 'datetime', 'flow', 'code']
        )
# Expand the dates to year month day
data[["year", "month", "day"]] =data["datetime"].str.split("-", expand=True)
data['year'] = data['year'].astype(int)
data['month'] = data['month'].astype(int)
data['day'] = data['day'].astype(int)
# Make a numpy array of this data
flow_data = data[['year', 'month','day', 'flow']].to_numpy()
# Getting rid of the pandas dataframe since we won't be using it this week
del(data)



# Jill Question Answering Code
#%%
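
As a small illustration (not part of the original assignment code) of how the flow_data array built above can be queried, boolean indexing on the year and month columns pulls out one month of flows, assuming the record actually covers that month:

# mean flow for September 2019; flow_data columns are [year, month, day, flow]
sept_2019 = flow_data[(flow_data[:, 0] == 2019) & (flow_data[:, 1] == 9), 3]
print(sept_2019.mean())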
Example #36
0
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import BoundaryNorm
from matplotlib.ticker import MaxNLocator
import numpy as np
import pandas as pd
from scipy.spatial import KDTree

from scipy.stats import gaussian_kde
from scipy.interpolate import Rbf

#x=np.loadtxt('', usecols=(0))
#y=np.loadtxt('', usecols=(1))
#z=np.loadtxt('', usecols=(2))
df_points = pd.read_table("wdata.dat",
                          sep="\s+",
                          usecols=[0, 1, 3],
                          header=None)
df_points.columns = ['vx', 'vy', 'Nex']

levels = MaxNLocator(nbins=15).tick_values(df_points.Nex.min(),
                                           df_points.Nex.max())

# pick the desired colormap, sensible levels, and define a normalization
# instance which takes data values and translates those into levels.
cmap = plt.get_cmap('RdBu')
#cmap = plt.get_cmap('seismic')
normal = BoundaryNorm(levels, ncolors=cmap.N, clip=True)

GSIZE = 1000
X, Y = np.mgrid[df_points.vx.min():df_points.vx.max():GSIZE * 1j,
                df_points.vy.min():df_points.vy.max():GSIZE * 1j]
Example #37
0
df_input = pd.read_excel(input_file)

#Storing UniProt as an easier to type variable 'u'
u = UniProt()

#Using a built-in UniProt method to create a data frame containg everything in UniProt
df_uniprot = u.get_df("organism:9606+and+reviewed:yes")

#Rename the common column to match the inputted column
df_uniprot.rename(columns={'Gene names  (primary )':'Gene Symbol', 'Entry':'UniProt Symbol',
                            'Proteomes':'Chromosome Number', 'Length':'Protein Length'}, inplace = True)

#Selecting columns I think are interesting
df1_uniprot = df_uniprot[['UniProt Symbol', 'Gene names', 'Gene Symbol', 'Protein names',
        'Chromosome Number', 'Sequence', 'Protein Length', 'Function [CC]', 'Gene ontology (GO)',
        'Gene ontology (biological process)', 'Gene ontology (molecular function)',
        'Gene ontology (cellular component)', 'Protein families']]

#Converting the NCBI gene list to a pandas data frame
df_ncbi = pd.read_table('NCBI_GeneID_File.txt')

#Merge the data frames on the UniProt
df_merged = df_input.merge(df1_uniprot, how='outer', on = 'Gene Symbol')
df_merged2 = df_merged.merge(df_ncbi, how = 'outer', on = 'Gene Symbol')


#Writing the data frame to an Excel file
out_file = pd.ExcelWriter('CRISPR_Uniprot_NCBI_DataFrame3.xlsx', engine = 'xlsxwriter')
df_merged2.to_excel(out_file)
out_file.close()
Example #38
0
def actual_data():
    with open("data/actual_data.dat") as data:
      read_data = pd.read_table(data)
      return read_data
Example #39
0
def invalid_data():
    with open("data/invalid_data.dat") as data:
      read_data = pd.read_table(data)
      return read_data
Example #40
0
parser.add_argument('--model_file',
                    '-mf',
                    type=str,
                    default='stupidvae.pkl',
                    help='Save model filename')
parser.add_argument('--init_stdev',
                    '-sd',
                    type=float,
                    default=0.01,
                    help='Weight init stdev')
args = parser.parse_args()

expn_pth = '/n/data_02/Basset/data/expn/roadmap/57epigenomes.RPKM.pc'
print("Reading gene expression data from:\n{}".format(expn_pth))
# Gene expression dataset
expn = pd.read_table(expn_pth, header=0)
col_names = expn.columns.values[1:]
expn = expn.drop(col_names[-1],
                 axis=1)  # 19795*57 right now # TODO: is this all right?
expn.columns = col_names
pinned_lookup = torch.nn.Embedding.from_pretrained(torch.FloatTensor(
    expn.as_matrix().T[1:]),
                                                   freeze=True)  # [1:] is new!
pinned_lookup.cuda()

torch.manual_seed(3435)
imgs = torch.poisson(pinned_lookup.weight)  # discretize data
# imgs = pinned_lookup.weight.round()
# imgs = pinned_lookup.weight
dat = torch.utils.data.TensorDataset(imgs, torch.zeros(
    56, 1))  # placeholder arg required pytorch <0.4.0...
Example #41
0
    'peso': 'float64',
    'duration': 'O',
    'carteira_a_mercado': 'O',
    'numero_operacoes': 'float64',
    'quant_negociada_titulos': 'float64',
    'valor_negociado': 'float64',
    'pmr': 'O',
    'convexidade': 'float64',
    'yield': 'float64',
    'redemption_yield': 'float64'
}

nomes_validos = list(valid_dtypes.keys())

# list of user agents
uas = pd.read_table('input/user-agents.txt',names=['ua'],skiprows=4,squeeze=True)
# list of ANBIMA holidays
fer = pd.read_excel('input/feriados_nacionais.xls',skipfooter=9, usecols=['Data'], parse_dates=['Data'], squeeze=True)
bday = pd.offsets.CDay(holidays=fer)



def get_indices_anbima(dt, wait=True):
    """
    dt: str '%d/%m/%Y' or a date object
    """
    if wait:
        if isinstance(wait,bool): wait = random.randint(1,3)
        sleep(wait)
    
    headers = {"User-Agent": np.random.choice(uas)}
Example #42
0
def control_data():
    with open("data/control_data.dat") as data:
      read_data = pd.read_table(data)
      return read_data
Example #43
0
    def load(cls, path, prefix, network=None):
        r"""
        Load data from the \'dat\' files located in specified folder.

        Parameters
        ----------
        path : string
            The full path to the folder containing the set of \'dat\' files.

        prefix : string
            The file name prefix on each file. The data files are stored
            as \<prefix\>_node1.dat.

        network : OpenPNM Network Object
            If given then the data will be loaded on it and returned.  If not
            given, a Network will be created and returned.

        Returns
        -------
        An OpenPNM Project containing a GenericNetwork holding all the data

        """
        net = {}

        # ---------------------------------------------------------------------
        # Parse the link1 file
        path = Path(path)
        filename = Path(path.resolve(), prefix+'_link1.dat')
        with open(filename, mode='r') as f:
            link1 = read_table(filepath_or_buffer=f,
                                  header=None,
                                  skiprows=1,
                                  sep=' ',
                                  skipinitialspace=True,
                                  index_col=0)
        link1.columns = ['throat.pore1', 'throat.pore2', 'throat.radius',
                         'throat.shape_factor', 'throat.total_length']
        # Add link1 props to net
        net['throat.conns'] = sp.vstack((link1['throat.pore1']-1,
                                         link1['throat.pore2']-1)).T
        net['throat.conns'] = sp.sort(net['throat.conns'], axis=1)
        net['throat.radius'] = sp.array(link1['throat.radius'])
        net['throat.shape_factor'] = sp.array(link1['throat.shape_factor'])
        net['throat.total_length'] = sp.array(link1['throat.total_length'])
        # ---------------------------------------------------------------------
        filename = Path(path.resolve(), prefix+'_link2.dat')
        with open(filename, mode='r') as f:
            link2 = read_table(filepath_or_buffer=f,
                                  header=None,
                                  sep=' ',
                                  skipinitialspace=True,
                                  index_col=0)
        link2.columns = ['throat.pore1', 'throat.pore2',
                         'throat.pore1_length', 'throat.pore2_length',
                         'throat.length', 'throat.volume',
                         'throat.clay_volume']
        # Add link2 props to net
        cl_t = sp.array(link2['throat.length'])
        net['throat.length'] = cl_t
        net['throat.conduit_lengths.throat'] = cl_t
        net['throat.volume'] = sp.array(link2['throat.volume'])
        cl_p1 = sp.array(link2['throat.pore1_length'])
        net['throat.conduit_lengths.pore1'] = cl_p1
        cl_p2 = sp.array(link2['throat.pore2_length'])
        net['throat.conduit_lengths.pore2'] = cl_p2
        net['throat.clay_volume'] = sp.array(link2['throat.clay_volume'])
        # ---------------------------------------------------------------------
        # Parse the node1 file
        filename = Path(path.resolve(), prefix+'_node1.dat')
        with open(filename, mode='r') as f:
            row_0 = f.readline().split()
            num_lines = int(row_0[0])
            array = sp.ndarray([num_lines, 6])
            for i in range(num_lines):
                row = f.readline()\
                       .replace('\t', ' ').replace('\n', ' ').split()
                array[i, :] = row[0:6]
        node1 = DataFrame(array[:, [1, 2, 3, 4]])
        node1.columns = ['pore.x_coord', 'pore.y_coord', 'pore.z_coord',
                         'pore.coordination_number']
        # Add node1 props to net
        net['pore.coords'] = sp.vstack((node1['pore.x_coord'],
                                        node1['pore.y_coord'],
                                        node1['pore.z_coord'])).T
        # ---------------------------------------------------------------------
        # Parse the node1 file
        filename = Path(path.resolve(), prefix+'_node2.dat')
        with open(filename, mode='r') as f:
            node2 = read_table(filepath_or_buffer=f,
                                  header=None,
                                  sep=' ',
                                  skipinitialspace=True,
                                  index_col=0)
        node2.columns = ['pore.volume', 'pore.radius', 'pore.shape_factor',
                         'pore.clay_volume']
        # Add node2 props to net
        net['pore.volume'] = sp.array(node2['pore.volume'])
        net['pore.radius'] = sp.array(node2['pore.radius'])
        net['pore.shape_factor'] = sp.array(node2['pore.shape_factor'])
        net['pore.clay_volume'] = sp.array(node2['pore.clay_volume'])
        net['throat.area'] = ((net['throat.radius']**2) /
                              (4.0*net['throat.shape_factor']))
        net['pore.area'] = ((net['pore.radius']**2) /
                            (4.0*net['pore.shape_factor']))

        if network is None:
            network = GenericNetwork()
        network = cls._update_network(network=network, net=net)

        # Use OpenPNM Tools to clean up network
        # Trim throats connected to 'inlet' or 'outlet' reservoirs
        trim1 = sp.where(sp.any(net['throat.conns'] == -1, axis=1))[0]
        # Apply 'outlet' label to these pores
        outlets = network['throat.conns'][trim1, 1]
        network['pore.outlets'] = False
        network['pore.outlets'][outlets] = True
        trim2 = sp.where(sp.any(net['throat.conns'] == -2, axis=1))[0]
        # Apply 'inlet' label to these pores
        inlets = network['throat.conns'][trim2, 1]
        network['pore.inlets'] = False
        network['pore.inlets'][inlets] = True
        # Now trim the throats
        to_trim = sp.hstack([trim1, trim2])
        trim(network=network, throats=to_trim)

        return network.project
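
A hedged usage sketch for the classmethod above; the class name Statoil and the paths are hypothetical, and only the documented return value (an OpenPNM Project holding a GenericNetwork) is relied on:

proj = Statoil.load(path='path/to/dat_files', prefix='Berea')  # hypothetical class and paths
net = proj.network   # assumes the Project exposes its network this way
print(net['pore.volume'].mean(), net['throat.radius'].mean())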
Example #44
0
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

Z = pd.read_table("../hotspot/hotspot_pairs_z.txt.gz", index_col=0)
hs_results = pd.read_table("../hotspot/hotspot.txt", index_col=0)

# Cluster things!

# to_drop = ['mt-Rnr1', 'mt-Rnr2']
# Z = Z.drop(to_drop, axis=1).drop(to_drop, axis=0)

sns.clustermap(Z,
               vmin=-2,
               vmax=2,
               metric='correlation',
               yticklabels=True,
               method='average')
plt.show()

# Now cluster and divide

from scipy.cluster.hierarchy import linkage, fcluster, dendrogram


def sort_clusters(cl):
    map_fun = {
        old_i: new_i + 1
        for new_i, old_i in enumerate(cl.value_counts().index)
    }
    cl = cl.map(map_fun)
Example #45
0
# BUILD STATION DATABASE
stas = ",".join([sta._code for sta in inventory[0].stations])
stdb_out = BNG_out+'sta_list'
!{path2envbin+'query_fdsn_stdb.py'} -N {network} -C {compstr} -S {stas} {stdb_out}

# %% codecell
# Perform BNG analysis
stdb_pkl = stdb_out+'.pkl'
!{path2envbin+'bng_calc_auto'} --times=-5.,15. --window=60. --bp=0.04,0.1 --min-mag={minmagnitude} --min-dist={mindist} --save-location {BNG_out} {stdb_pkl}

# %% codecell
# Plot BNG output and save
!{path2envbin+'bng_average'} --load-location {BNG_out} --plot --save {stdb_pkl}
# !{path2envbin+'bng_average'} --load-location {BNG_out} {stdb_pkl}

# %% codecell
# Combine all measurements into single file
pathlist = sorted(Path(BNG_out).glob('*/orientation_bng.txt'))
file = open(BNG_out+'/orientations_BNG.txt', 'w')
file.write("%8s %10s %10s %5s\n" % ('sta', 'phi', 'err', 'num'))
for path in pathlist:
    data = pd.read_table(path, delim_whitespace=True)
    sta = data.sta[0]
    phi = data.phi[0]
    err = data.err[0]
    num = data.num[0]
    file.write("%8s %10f %10f %5d\n" % (sta, phi, err, num))
file.close()
    

# In[1]:


import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


# ### For each gene, output its exon regions as "start-end,start-end,...". Illustrated with NIPBL.

# In[2]:


UCSC = pd.read_table("refFlat.hg38.txt", skiprows=1, names=('geneName', 'name', 'chrom', 'strand', 'txStart', 'txEnd', 'cdsStart', 'cdsEnd', 'exonCount', 'exonStarts', 'exonEnds'))
UCSC
# If read as-is, the first column ends up being named '#geneName'.
# So the header row is skipped and the column names are supplied manually via `names`.


# In[3]:


# "geneName" 列の値が NIPBLと一致する行をdfとして取得。
# .copy()にしないと、attribute errorとなる。
NIPBL_UCSC_df = UCSC.query("geneName == 'NIPBL'").copy()
NIPBL_UCSC_df

# Two transcript variants with different cdsEnd values are shown.
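
# A hedged sketch (not in the original) of the stated goal: format each gene's
# exon regions as "start-end,start-end,...". refFlat stores exonStarts/exonEnds
# as comma-separated strings with a trailing comma.
row = NIPBL_UCSC_df.iloc[0]
starts = [int(s) for s in row['exonStarts'].rstrip(',').split(',')]
ends = [int(e) for e in row['exonEnds'].rstrip(',').split(',')]
exon_str = ','.join('{}-{}'.format(s, e) for s, e in zip(starts, ends))
print(exon_str)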
Example #47
0
def run_model(src, dst, clinopyroxene=False):

    with app.app_context():
        data = sample_ree(normalized=True,
                          mode='cpx' if clinopyroxene else 'whole_rock')
        colors = sample_colors()

    model = DepletionModel(src)

    if clinopyroxene:
        depleted = model.fit_HREE(data, table='clinopyroxene_0 trace')
    else:
        depleted = model.fit_HREE(data)

    enrichment, multiplier = model.enrichment(data, depleted)

    # Create primitive-mantle normalized dataset
    Sun_PM = get_melts_data('literature/Sun_McDonough_PM.melts')
    PM_trace = Sun_PM.trace.ix[:, 0]

    # Add NMORB
    NMORB = get_melts_data('literature/NMORB_trace.melts')
    NMORB_trace = ree_only(NMORB.trace.transpose() / PM_trace)

    # Alkali basalt
    alkali = read_table('literature/Farmer_1995-Alkali-basalt.txt',
                        comment="#",
                        index_col=0)
    alkali /= PM_trace
    alkali_trace = ree_only(alkali)

    vals = [element(i) for i in data.columns]
    d = ree_only(depleted)

    grid = dict(height_ratios=(4.5, 1), hspace=0.1, right=0.99, left=0.16)
    fig, (ax1, ax2) = subplots(2, 1, figsize=(3.5, 6), gridspec_kw=grid)

    def create_main_axis(ax):
        for i, row in d.iterrows():
            c = colors.ix[row.name][0]

            # Plot real data
            series = data.ix[row.name]
            u = series.map(lambda x: x.n)
            s = series.map(lambda x: x.s)
            ax.fill_between(vals,
                            u - s,
                            u + s,
                            facecolor=c,
                            edgecolor='none',
                            alpha=0.2)

            def plot(name, x, y, **kwargs):
                if i == 'CK-3':
                    kwargs['label'] = name
                else:
                    kwargs['label'] = ""
                p = ax.plot(x, y, color=c, **kwargs)

            if clinopyroxene:
                s = 'clinopyroxene'
            else:
                s = 'whole-rock'
            plot('Measured ' + s, vals, u)

            # Plot calculated best fit
            plot("Modeled depleted",
                 d.columns,
                 row,
                 linestyle='--',
                 linewidth=1)

            v = enrichment.ix[row.name]
            if i == 'CK-2':
                # Don't include CK-2 because it isn't depleted, so results are spurious.
                continue
            plot("Enriching melt", d.columns, v, linestyle=':', linewidth=1)

        # Plot NMORB
        ax.fill_between(NMORB_trace.columns,
                        NMORB_trace.ix[0, :],
                        NMORB_trace.ix[0, :] - 0.5,
                        color='#bbbbbb',
                        linewidth=1.5,
                        zorder=-5,
                        label="")

        ax.fill_between(alkali_trace.columns,
                        alkali_trace.min(),
                        alkali_trace.max(),
                        facecolor='#dddddd',
                        edgecolor='none',
                        zorder=-10,
                        label="")

        ax.set_ylim(.01, 100)
        ax.set_xlim(element('La') - 0.1, element('Lu'))
        ax.yaxis.set_ticklabels(
            ["{:g}".format(v) for v in ax.yaxis.get_ticklocs()])
        ax.set_ylabel("Rare-earth element abundance / Primitive Mantle")
        ax.xaxis.set_ticks(vals)
        ax.xaxis.set_ticklabels(data.columns)
        ax.set_yscale('log')
        ax.text(element('Ce') - 0.5,
                40,
                "Alkali basalt",
                rotation=-28,
                color='#888888')
        ax.text(element('La'), 5, "NMORB", rotation=15, color='#888888')
        legend = ax.legend(loc="upper right")
        fr = legend.get_frame()
        fr.set_lw(0.5)

    create_main_axis(ax1)
    update_axes(ax1)

    fig.subplots_adjust(top=0.99, right=0.99)
    ree_scatter(ax2, model, data, colors)
    ax2.set_ylim([0, 1.2])
    ax2.set_xlabel(r'HREE depletion degrees (%)')
    ax2.set_ylabel("Enriching fluid\nassimilated (%)")
    ax2.yaxis.set_label_coords(-0.1, 0.22)
    update_axes(ax2)
    axis_labels(ax1, ax2, pad=.16, fontsize=14)

    fig.savefig(dst, bbox_inches='tight')
Example #48
0
import pandas as pd
import os, sys

if len(sys.argv) != 2:
    print('Error: No task specified')
    print('e.g. separate-files 2_back_vs_0_back')
    sys.exit(1)

if not os.path.exists('subjects'):
    os.makedirs('subjects')

for run in {'1', '2'}:
    subjectFile = 'taskBOLD_{0}_run_{1}-rh.csv'.format(sys.argv[1], run)

    print('Reading {0}'.format(subjectFile))
    df = pd.read_table(subjectFile, header=0, sep=',', index_col=0)

    nSubjects = len(df.index)
    print('{0} subjects'.format(nSubjects))

    f = open('subjects-{0}.txt'.format(run), 'w')

    for index in range(0, nSubjects):
        subjectID = 'NDAR_' + df.iloc[index, 0][5:16]
        outFile = 'subjects/' + subjectID + '-' + run + '.dscalar.nii'
        # Record the subject ID in the per-run list (the original snippet opened
        # the file above but never wrote to or closed it).
        f.write(subjectID + '\n')

        cmd = 'wb_command -cifti-merge {0} -cifti {1}_run{2}_sm5.dscalar.nii -column {3}'.format(
            outFile, sys.argv[1], run, index + 1)
        print(cmd)
        os.system(cmd)

    f.close()
Example #49
0
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder


def encodeFeatures(df):
    # Label-encode every categorical/object column of the dataframe.
    columnsToEncode = list(df.select_dtypes(include=['category', 'object']))
    le = LabelEncoder()
    for feature in columnsToEncode:
        try:
            df[feature] = le.fit_transform(df[feature])
        except:
            print('Error encoding ' + feature)
    return df


df = pd.read_table(
    "german.data",
    header=None,
    sep=' ',
    names=[
        'chkngAcctStatus', 'durationMonths', 'creditHistory', 'loanPurpose',
        'creditAmount', 'savingsTotal', 'crrntEmplmtSince', 'instllmtPct',
        'persnlStatus', 'othrDebtorGuaranters', 'crrntResidenceSince',
        'propertyType', 'age', 'otherInstllmtType', 'housingType',
        'existingCredits', 'jobStatus', 'numDependents', 'registeredPhone',
        'foriegnWorker', 'goodBad'
    ])

df['goodBad'] = df["goodBad"] - 1

dfPred = encodeFeatures(df)

predictors = dfPred.drop('goodBad', axis=1)

targets = df['goodBad']

np.random.seed(123)
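
# A hedged sketch (not from the source) of how the prepared predictors/targets
# might be used next; the model choice and test_size are illustrative assumptions.
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(
    predictors, targets, test_size=0.3, random_state=123)
clf = RandomForestClassifier(n_estimators=100, random_state=123)
clf.fit(X_train, y_train)
print('Test accuracy: {:.3f}'.format(clf.score(X_test, y_test)))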
Example #50
0
# Compute and print the standard deviation of ratings for male and female users in the MovieLens 100k dataset
import pandas as pd
unames = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_table('ml-100k/u.user',
                      sep='|',
                      header=None,
                      names=unames,
                      engine='python')
rnames = ['user_id', 'item_id', 'rating', 'timestamp']
ratings = pd.read_table('ml-100k/u.data',
                        sep='\t',
                        header=None,
                        names=rnames,
                        engine='python')
data = pd.merge(users, ratings)

mean_ratings = data.pivot_table('rating',
                                index=['user_id', 'gender'],
                                aggfunc='mean')
std = mean_ratings.groupby('gender').std()
print(std)

# mean_ratings = data.pivot_table('rating',index='user_id',columns = 'gender',aggfunc='mean')
# female_ratings = mean_ratings['F']
# female_ratings_std = female_ratings.std()
#
# male_ratings = mean_ratings['M']
# male_ratings_std = male_ratings.std()
#
# print('Gender')
# print('M %.2f' % male_ratings_std)
Example #51
0
    def read_table(self, *args, **kwargs):
        kwargs = self.update_kwargs(kwargs)
        return read_table(*args, **kwargs)
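
# A hedged, self-contained illustration (not from the source) of the pattern above:
# a thin wrapper class that injects default keyword arguments before delegating
# to pandas.read_table.
import pandas as pd

class TableReader(object):
    def __init__(self, **defaults):
        self.defaults = defaults           # e.g. sep='\t', comment='#'

    def update_kwargs(self, kwargs):
        merged = dict(self.defaults)
        merged.update(kwargs)              # explicit kwargs override the defaults
        return merged

    def read_table(self, *args, **kwargs):
        kwargs = self.update_kwargs(kwargs)
        return pd.read_table(*args, **kwargs)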
Example #52
0
import pandas as pd

data_frame = pd.read_table("popular-names.txt", header=None)
print(data_frame.sort_values(2, ascending=False))
# Import data_processing
from numpy_regressor.data_processing import DataProcessing

# Import Pandas
import pandas as pd

# Import Bokeh
import bokeh
from bokeh.plotting import figure, show
from bokeh.palettes import d3

# Getting the data from uci data repo
airfoil_df = pd.read_table(
    filepath_or_buffer=
    "https://archive.ics.uci.edu/ml/machine-learning-databases/00291/airfoil_self_noise.dat",
    names=[
        "Frequency", "Angle of attack", "Chord length", "Free-stream velocity",
        "Suction side displacement", "Scaled sound pressure"
    ])

# Creating class objects
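# NOTE (not in the original snippet): the import for `Regression` is not shown;
# it is presumably provided by the numpy_regressor package alongside DataProcessing.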
airfoil_regressor = Regression()
airfoil_data_process = DataProcessing()

# Splitting up train and test set
airfoil_df_train, airfoil_df_test = airfoil_data_process.train_test_split(
    airfoil_df)

# Calling the regression function to get the prediction
prediction = airfoil_regressor.my_regression(airfoil_df_train,
                                             airfoil_df_test.iloc[:, 0:-1], 1)
Example #54
0
import scipy.sparse as sp
import numpy as np
import pandas as pd
import itertools as it
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from statistics import mean
from time import time, ctime
from math import ceil
import warnings
warnings.filterwarnings("ignore")

data_path = "~/Desktop/"
add_path = "C:\\Users\\Kyle\\OneDrive\\Documents\\GMU Classes\\CS 584\\HW4_Jackson_Truong\\data\\"

#reads in all the data as matrices, we only use test, train, and movie_tag, though
test_array = pd.read_table(add_path+"test.dat", skip_blank_lines=False, \
                     delim_whitespace=True).to_numpy()
train_array = pd.read_table(add_path+"train.dat", skip_blank_lines=False, \
                      delim_whitespace=True).to_numpy()
genre_array = pd.read_table(add_path+"movie_genres.dat", skip_blank_lines=False, \
                      delim_whitespace=True).to_numpy()
movie_tag_array = pd.read_table(add_path+"movie_tags.dat", \
                           skip_blank_lines=False).to_numpy()
actor_array = pd.read_table(add_path+"movie_actors.dat", \
                           skip_blank_lines=False).to_numpy()
actor_array = np.delete(actor_array, 2, 1)
director_array = pd.read_table(add_path+"movie_directors.dat", \
                           skip_blank_lines=False).to_numpy()[:,0:2]

#%%

#this section forms dicts to reindex the various IDs, to reduce dimensionality
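
# A hedged sketch (not from the source) of the reindexing described above,
# assuming column 0 holds user IDs and column 1 holds movie IDs in the
# train/test arrays: map raw IDs to dense 0-based indices for sparse matrices.
user_ids = np.unique(np.concatenate([train_array[:, 0], test_array[:, 0]]))
movie_ids = np.unique(np.concatenate([train_array[:, 1], test_array[:, 1]]))
user_index = {uid: i for i, uid in enumerate(user_ids)}
movie_index = {mid: i for i, mid in enumerate(movie_ids)}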
Example #55
0
def validateFacilityData(facility, src):
    ocfg = FACMETADATA[facility]
    oopts = ocfg.get('options', {})
    enc, sep = oopts.get('enc', DEF_ENCODING), oopts.get('sep', DEF_SEP)
    #if not osp.exists(src):
    #    raise FileNotFoundError('input file %s not found - nothing to check' % src)
    try:
        df = pd.read_csv(src, encoding = enc, sep = sep)
    #except FileNotFoundError:      # we tested that already...
    #    raise FileNotFoundError('input file %s not found - nothing to check' % src)
    except:
        try:
            df = pd.read_table(src, encoding = enc, sep = sep, compression = 'infer')
        except:
            raise IOError("Impossible to load source data - format not recognised")
    oindex = ocfg.get('index',{}).copy()
    nindex = [col.get('name') for col in oindex.values()]
    try:
        columns = set(list(df.columns)).difference(set(nindex))
        assert columns == set()
    except AssertionError:
        raise IOError("Unknown column present in the dataframe: '%s'" % list(columns))
    else:
        try:
            columns = set(list(nindex)).difference(set(df.columns))
            assert columns == set()
        except AssertionError:
            logging.warning("\n! Missing columns in source file: '%s' !" % list(columns))
    nindex = {col.get('name'): col for col in oindex.values()}
    for col in df.columns:
        # check missing values
        try:
            assert df[col].isnull().any() is np.bool_(False)
        except AssertionError:
            try:
                assert df[col].isnull().all() is np.bool_(False)
            except AssertionError:
                logging.warning("\n! Column '%s' empty - missing values only !" % col)
                continue
        else:
            # logging.warning("\n! No missing values in column '%s' !" % col)
            pass
        # check type
        dtype = nindex[col].get('type')
        if dtype == 'str':
            pass #
        elif dtype is not None:
            try:
                assert df[col].dtype==object or df[col].dtype in Type.pytname2npt(dtype) # and dtype != object
            except AssertionError:
                logging.warning("\n! Unexpected type '%s' for column '%s' !" % (df[col].dtype,col))
        # check values/format
        dfmt = values = nindex[col].get('values')
        if values is not None:
            # check values range
            if dtype == "datetime":
                # check date format
                try:
                    assert pd.to_datetime(df[col], format=dfmt, errors='coerce').notnull().all()
                except AssertionError:
                    logging.warning("\n! Unexpected date format for column '%s' !" % col)
            else:
                try:
                    values = [values,] if not isinstance(values, Sequence) else values
                    assert df[col].dropna().isin(values).all()
                except AssertionError:
                    raise IOError("Wrong input values in column '%s'" % col)
    # check id uniqueness
    try: # note the use of INDEX here, not nindex, though the names end up being
        # the same
        assert df[oindex.get('id',{})['name']].dropna().is_unique is True
    except AssertionError:
        raise IOError("Duplicated identifier IDs")
    # check geographical coordinates
    for lL in ['lat','lon']:
        col = oindex.get(lL,{})['name']
        if col in df.columns:
            try:
                assert (df[col]
                        .dropna()
                        .between(MINMAX_LL[lL][0],MINMAX_LL[lL][1])
                        .all()) is np.bool_(True)
            except AssertionError:
                raise IOError("Wrong input values for %s geographical coordinate '%s'" % lL)
"""
Created on Mon Aug 19 22:07:40 2019

@author: USER
"""

# HEART DISEASE DIAGNOSIS

# Using ANN classifier

# Importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Data Preprocessing
dataset = pd.read_table('processed.cleveland.data', sep=',', header=None)

X = dataset.iloc[:, :-1]

y_class = dataset.iloc[:, -1]

y = [item > 0 for item in y_class]

# Replace missing values ('?') with each column's most frequent value
X[11].value_counts()
X[11] = X[11].map({'?': 0, '1.0': 1.0, '2.0': 2.0, '3.0': 3.0, '0.0': 0.0})
X[12] = X[12].map({'6.0': 6.0, '3.0': 3.0, '7.0': 7.0, '?': 3.0})

X = X.values

# Handling categorical Variables
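
# A hedged sketch of the truncated "Handling categorical Variables" step, assuming
# cp (2), restecg (6), slope (10) and thal (12) are the categorical columns of the
# Cleveland dataset; the encoder choice is an illustrative assumption.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(
    [('onehot', OneHotEncoder(drop='first'), [2, 6, 10, 12])],
    remainder='passthrough')
X = ct.fit_transform(X)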
Example #57
0
import sys

sys.stderr = open(snakemake.log[0], "w")

import common
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

calls = pd.read_table(snakemake.input[0], header=[0, 1])
samples = [name for name in calls.columns.levels[0] if name != "VARIANT"]
sample_info = calls.loc[:,
                        samples].stack([0,
                                        1]).unstack().reset_index(1,
                                                                  drop=False)
sample_info = sample_info.rename(columns={"level_1": "sample"})

sample_info = sample_info[sample_info["DP"] > 0]
sample_info["freq"] = sample_info["AD"] / sample_info["DP"]
sample_info.index = np.arange(sample_info.shape[0])

plt.figure()

sns.stripplot(x="sample", y="freq", data=sample_info, jitter=True)
plt.ylabel("allele frequency")
plt.xticks(rotation="vertical")

plt.savefig(snakemake.output.freqs)

plt.figure()
cluster_disease = dict()
df = pd.read_csv('../module/NG_network_module_Q0.3200458.csv')
list1 = df.values.tolist()
for each in list1:
    disease, name, category, cluster = each
    if cluster not in cluster_disease:
        cluster_disease[cluster] = set()
    cluster_disease[cluster].add(disease)
    if disease == 'M07':
        joint_mascular_neur_spine_cluster = cluster
    if disease == 'K80':
        hepatobiliary_cluster = cluster

multimorbidity_pathway = dict()
df = pd.read_table('../overlap/multimorbidity_pathway.txt')
list1 = df.values.tolist()
for each in list1:
    multimorbidity_pathway[(each[0], each[1])] = set(each[4].split(';'))

print(
    '# ------------------ Joint-Muscular-Neurological-Spine -------------- #')
list1 = list()
for each in multimorbidity_pathway:
    if ('E66' in each) & (len(
            set(each) & cluster_disease[joint_mascular_neur_spine_cluster]) !=
                          0):
        list1 += list(multimorbidity_pathway[each])
for each in list1:
    print(each)
Example #59
0
def network(request):

    error_message = ""
    jump_div = ""

    # Option 1: List of Ensembl IDs
    if "option1" in request.POST:
        input_query = []
        for element in request.POST['input'].split('\n'):
            element = element.strip()
            if element:
                input_query.append(element)

        input_query = list(set(input_query))

        # max input IDs
        if 2000 > len(input_query) > 1:
            if input_query[0][0:4] == 'ENSG' or input_query[0][
                    0:4] == 'ENST' or input_query[0][0:4] == 'ENSP':
                job_num = str(random.randrange(500))
                with open(f'{jobs_path}/{job_num}.txt',
                          "wb") as fp:  # Pickling
                    pickle.dump(input_query, fp)
                return redirect(Multi_proteins, job=job_num)

    # Option 2: Upload file
    if "option2" in request.POST and 'gene-count-file' in request.FILES:
        error_message_suffix = ""

        try:
            # --- Check input file for correct format
            # Try to decode as UTF-8, sanitize and parse as table
            try:
                file_string = escape(
                    request.FILES['gene-count-file'].read().decode('UTF-8'))
                file_buffer = StringIO(file_string)
                # Parse as pandas dataframe
                transcript_count_df = pd.read_table(file_buffer)
            except UnicodeDecodeError:
                error_message_suffix = "could not be parsed as an text file"
                raise RuntimeError

            except ParserError:
                error_message_suffix = f"could not be parsed as an table file (CSV or TSV)"
                raise RuntimeError

            # Check input shape
            if transcript_count_df.shape[0] < 2 or transcript_count_df.shape[
                    1] < 2:
                error_message_suffix = f"could not be parsed as table or has less than two rows and columns"
                raise RuntimeError

            # Kevin: Zakaria please insert the magic down below:)
            # Zaka: And this is where the magic happens :p

            # Check if the first row corresponds to transcript Ensembl IDs
            if not (str(transcript_count_df.iloc[0, 0]).startswith('ENST')
                    or str(transcript_count_df.iloc[1, 0]).startswith('ENST')):
                error_message_suffix = f"must have Ensembl transcript IDs in the first column starting with \"ENST\""
                raise RuntimeError

            # --- Try parsing counts for the different options (search for FPKM, tpm or counts)
            # max_isoforms: the max number of isoforms to consider:
            max_isoforms = int(request.POST['transcript-count-max'])

            column_names = transcript_count_df.columns

            # Cufflinks file (or a similar thing)
            if "FPKM" in column_names:
                transcript_count_df = transcript_count_df.sort_values(
                    by=['FPKM'], ascending=False)
                cut_rows = transcript_count_df.iloc[:,
                                                    0].unique()[:max_isoforms]
                print('Input matches cufflinks output ')

            # Kallisto output counts in tpm
            elif "tpm" in column_names:
                transcript_count_df = transcript_count_df.sort_values(
                    by=['tpm'], ascending=False)
                cut_rows = transcript_count_df.iloc[:,
                                                    0].unique()[:max_isoforms]
                print('Input matches kallisto output ')

            # Generic count matrix
            elif "counts" in column_names:
                transcript_count_df = transcript_count_df.sort_values(
                    by=['counts'], ascending=False)
                cut_rows = transcript_count_df.iloc[:,
                                                    0].unique()[:max_isoforms]
                print('Input with counts column ')

            # Could not find the row
            else:
                error_message_suffix = "does not contain a column with the counts. The column must be named either \"FPKM\", \"tpm\" or \"counts\""
                raise RuntimeError

            # and let DIGGER do the magic ;)
            job_num = str(random.randrange(500))
            with open(f'{jobs_path}/{job_num}.txt', "wb") as fp:
                pickle.dump(cut_rows, fp)  # Pickling
                print(f"Starting network analysis with {len(cut_rows)} rows")
            return redirect(Multi_proteins, job=job_num)

        except RuntimeError:
            print("Could not parse uploaded file acorrectly")
            error_message = f"The uploaded file \"{request.FILES['gene-count-file']}\" {error_message_suffix}."
            jump_div = 'option2'

    return render(request,
                  'setup/network.html',
                  context={
                      'error_message': error_message,
                      'jump_div': jump_div
                  })
import argparse
import pandas as pd
import sys

parser = argparse.ArgumentParser()
parser.add_argument("-i",
                    "--input",
                    type=str,
                    required=True,
                    help="Input CCF file")
parser.add_argument("-b",
                    "--barcodes",
                    type=str,
                    required=True,
                    help="List of barcode (one per line)")
parser.add_argument("-o",
                    "--output",
                    type=str,
                    required=True,
                    help="Output CCF file")
args = parser.parse_args()

if __name__ == "__main__":
    ccf = pd.read_table(args.input, header=None)
    with open(args.barcodes, 'r') as f:
        barcodes = set([line.strip() for line in f])
    filtered = ccf[ccf[5].isin(barcodes)]
    filtered.to_csv(args.output, sep='\t', header=False, index=False)