Ejemplo n.º 1
0
def typeI_table(n1, n2, ncases, path=None):
    """Return a table of the m-test statistics under the null hypothesis.

    The function returns a table containing the value of the
    m-statistics of `ncases` draws from two populations of size `n1`
    and `n2` under the null hypothesis that the mean of the two
    populations is the same.

    If a stored table for population sizes `n1` and `n2` already holds
    at least `ncases` entries, all the stored values are returned.
    Otherwise, the missing cases are computed, appended to the stored
    values, saved back to disk, and the extended table is returned.

    Parameters
    ----------
    n1 : number of samples in population 1
    n2 : number of samples in population 2
    ncases : minimum number of cases the returned table must contain
    path : path to the m-test tables (see `get_tables_path`)

    Returns
    -------
    test_values : 1D array of m-test statistics, containing *at least*
                  `ncases` elements, but possibly more
    """

    fname = os.path.join(get_tables_path(path), TABLESNAME % (n1, n2))
    if os.path.exists(fname):
        logging.debug('Loading type I table %s', fname)
        npzfile = sp.load(fname)
        try:
            # flatten() returns a copy, so the data stays valid after the
            # archive is closed
            test_values = npzfile['test_values'].flatten()
        finally:
            # release the file handle held by the npz archive
            npzfile.close()
    else:
        test_values = sp.array([])

    nvalues = test_values.shape[0]
    if nvalues >= ncases:
        # the stored table is already large enough
        return test_values

    # from here on at least one case is missing
    nmissing = ncases - nvalues
    logging.debug('Requested %d cases, found %d, missing %d',
                  ncases, nvalues, nmissing)
    print('The requested mtest table is incomplete.')
    print('Need to process %d additional cases, this may take some time.'
          % nmissing)

    # compute missing entries, one m-statistic per generated pair of
    # populations (row i of each array)
    missing_values = sp.zeros((nmissing,))
    pop1_test, pop2_test = _random_same_mean(n1, n2, nmissing)

    for i in progressinfo(range(nmissing), style='timer'):
        missing_values[i] = mtest_marginal_likelihood_ratio(pop1_test[i, :],
                                                            pop2_test[i, :],
                                                            nprior=_NPRIOR)

    # update and save table
    test_values = sp.concatenate((test_values, missing_values))
    logging.debug('Saving updated table %s', fname)
    # NOTE(review): savez appends '.npz' to file names that lack it;
    # presumably TABLESNAME already ends in '.npz', otherwise the
    # exists() check above would never find the saved table -- confirm.
    sp.savez(fname, test_values=test_values)

    return test_values
Ejemplo n.º 2
0
def typeII_table(n1, n2, ncases, mean, std, path=None):
    """Return a table of the m-test statistics under a specific hypothesis.

    The function returns a table containing the value of the
    m-statistics and (for comparison) the result of the independent
    t-test of `ncases` draws from two populations of size `n1` and
    `n2`, the first with distribution Normal(mean, std^2), and the
    second with distribution Normal(0, 1).

    The table is used to compute the power of the test under different
    conditions.

    If a stored table for these parameters already holds at least
    `ncases` entries, all the stored values are returned.
    Otherwise, the missing cases are computed, appended to the stored
    values, saved back to disk, and the extended tables are returned.

    Parameters
    ----------
    n1 : number of samples in population 1
    n2 : number of samples in population 2
    ncases : minimum number of cases the returned tables must contain
    mean : mean of population 1
    std : standard deviation of population 1
    path : path to the m-test tables (see `get_tables_path`)

    Returns
    -------
    m_test_values : 1D array of m-test statistics, containing *at least*
                    `ncases` elements, but possibly more
    t_test_values : 1D array of independent t-test results for the same
                    draws; the stored value is element [1] of the
                    `stats.ttest_ind` result, i.e. the p-value. Contains
                    *at least* `ncases` elements, but possibly more
    """

    fname = os.path.join(get_tables_path(path),
                         TYPEII_TABLESNAME % (n1, n2, mean, std))
    if os.path.exists(fname):
        logging.debug('Loading type II table %s', fname)
        npzfile = sp.load(fname)
        try:
            # flatten() returns copies, so the data stays valid after the
            # archive is closed
            m_test_values = npzfile['m_test_values'].flatten()
            t_test_values = npzfile['t_test_values'].flatten()
        finally:
            # release the file handle held by the npz archive
            npzfile.close()
    else:
        m_test_values = sp.array([])
        t_test_values = sp.array([])

    # the size of the m-test table decides whether more cases are needed
    nvalues = m_test_values.shape[0]
    if nvalues >= ncases:
        # the stored table is already large enough
        return m_test_values, t_test_values

    # from here on at least one case is missing
    nmissing = ncases - nvalues
    logging.debug('Requested %d cases, found %d, missing %d',
                  ncases, nvalues, nmissing)
    print('The requested mtest table is incomplete.')
    print('Need to process %d additional cases, this may take some time.'
          % nmissing)

    # compute missing entries: row i of each array is one pair of
    # populations, evaluated with both tests
    pop1_test, pop2_test = _random_different_mean(n1, n2, nmissing,
                                                  mean, std)

    m_missing_values = sp.zeros((nmissing,))
    t_missing_values = sp.zeros((nmissing,))
    for i in progressinfo(range(nmissing), style='timer'):
        m_missing_values[i] = mtest_marginal_likelihood_ratio(
            pop1_test[i, :], pop2_test[i, :], nprior=_NPRIOR)
        # element [1] of the ttest_ind result is the p-value
        t_missing_values[i] = stats.ttest_ind(pop1_test[i, :],
                                              pop2_test[i, :])[1]

    # update and save table
    m_test_values = sp.concatenate((m_test_values, m_missing_values))
    t_test_values = sp.concatenate((t_test_values, t_missing_values))
    logging.debug('Saving updated table %s', fname)
    # NOTE(review): savez appends '.npz' to file names that lack it;
    # presumably TYPEII_TABLESNAME already ends in '.npz', otherwise the
    # exists() check above would never find the saved table -- confirm.
    sp.savez(fname, m_test_values=m_test_values, t_test_values=t_test_values)

    return m_test_values, t_test_values
Ejemplo n.º 3
0
def dofile(filepath):
    """Load the records of one input file into the database tables.

    The file is wrapped in a `FileInfo` object. Unless disabled through
    the module-level flags `skip_sanity` / `skip_fileinfo`, the file is
    skipped when its sanity check fails or when registering it raises
    an IntegrityError (reported as already processed).

    Each line is dispatched on its first two characters:

    * '10' -- converted with `line2sqldict` using `poltranscols` and
      written through `write2table`
    * '20' -- same, using `comtranscols`
    * '00' and '99' -- accepted, nothing is inserted for them
    * anything else -- the process exits with status 1

    After the loop a summary line is printed, `fi.final_check` is run
    on the record counts (the process exits with status 1 if it fails)
    and `fi.set_checksum()` is called.

    Parameters
    ----------
    filepath : path of the file to process, passed to `FileInfo`

    Returns
    -------
    None
    """
    polc = 0
    comc = 0
    inserts = 0
    fi = FileInfo(filepath)
    print(fi.FileName)
    if not fi.sanity_check() and not skip_sanity:
        print('Sanity Check Failed! %s %s' % (fi.SanityFail, fi.CycleNumber))
        print('Skipping this file')
        return
    if not skip_fileinfo:
        try:
            file_id = fi.addfileinfo()
        except sa.exceptions.IntegrityError:
            print('Already processed this file')
            return
    else:
        file_id = 0
    infile = fi.in_lines
    # optionally wrap the lines in a progress display
    if use_mdp:
        lines = progress_bar.progressinfo(infile)
    else:
        lines = infile
    start = time.time()
    ROWCHUNKS = 10000
    comsql = []
    polsql = []
    # expected number of records still to come, taken from the FileInfo
    polrowsleft = fi.NumPolRecs
    comrowsleft = fi.NumComRecs
    for line in lines:
        line = line.strip()
        rectype = line[0:2]
        if rectype == '10':
            polc += 1
            polrowsleft -= 1
            polsql.append(line2sqldict(line, poltranscols, file_id))
            # NOTE(review): once no more than ROWCHUNKS records are
            # expected to remain, the second test is true on every line,
            # so rows are written one at a time from then on. Kept as-is
            # to preserve the existing write pattern -- consider flushing
            # on full chunks only.
            if len(polsql) % ROWCHUNKS == 0 or polrowsleft <= ROWCHUNKS:
                inserts += write2table(polsql, poltranscols)
                polsql = []
        elif rectype == '20':
            comc += 1
            comrowsleft -= 1
            comsql.append(line2sqldict(line, comtranscols, file_id))
            # NOTE(review): same flush behaviour as for '10' records
            if len(comsql) % ROWCHUNKS == 0 or comrowsleft <= ROWCHUNKS:
                inserts += write2table(comsql, comtranscols)
                comsql = []
        elif rectype in ('00', '99'):
            # recognised record types for which nothing is inserted
            pass
        else:
            print('Unidentified line!')
            sys.exit(1)
    # Flush rows that are still buffered. This only happens when the
    # expected record counts overstate the file contents by more than
    # ROWCHUNKS; without it those rows would be dropped silently.
    if polsql:
        inserts += write2table(polsql, poltranscols)
    if comsql:
        inserts += write2table(comsql, comtranscols)
    # drop the references to the file contents before the final steps
    del lines, infile
    ttime = time.time() - start
    print('%s: comrecs: %d, polrec: %d, total: %d, inserts: %d (time=%d)'
          % (fi.FileName, comc, polc, comc+polc, inserts, ttime))
    if not fi.final_check(polc, comc):
        print('Failed Final Check! %s' % (fi.SanityFail,))
        sys.exit(1)
    fi.set_checksum()