def insert_metadata(datasets):
    fetch = lambda x: geo.fetch(x, destdir='data/')
    with closing(dbconnect()) as db:
        with closing(db.cursor()) as c:
            for ds in datasets:
                dataset = fetch(ds)
                did = dataset.id
                desc = dataset.meta['description']
                pmid = dataset.meta.get('pubmed_id', None)
                print "fetching abstract..."
                abstract = (Entrez.efetch(id=pmid, db='pubmed',
                    rettype='abstract', retmode='text').read() 
                    if pmid else None)
                print "inserting metadata..."
                c.execute("REPLACE INTO metadata VALUES(%s, %s, %s, %s);", 
                    (did, desc, pmid, abstract))
                print "fetching annotations..."
                desc_annos = annotate(desc)
                
                abst_annos = (annotate(abstract.replace('\n', ' ')) 
                    if abstract else None)
                print "inserting annotations...",
                desc_sql = ("REPLACE INTO annotations(dataset, goid, term, "
                    "source) values('%s', %%s, %%s, '%s')" 
                    % (did, 'description'))
                c.executemany(desc_sql, desc_annos)
                if abstract:
                    abst_sql = ("REPLACE INTO annotations(dataset, goid, term,"
                        "source) values('%s', %%s, %%s, '%s')" 
                        % (did, 'abstract'))
                    c.executemany(abst_sql, abst_annos)
                db.commit()
                print "done.\n"
Ejemplo n.º 2
0
from __future__ import division, print_function, unicode_literals

import argparse
import sys

from __init__ import fetch

# Command-line entry point: fetch a git repository (optionally bare, in
# which case a checkout_dir must be given) and exit 0 on success, 1 on
# failure.
p = argparse.ArgumentParser()
p.add_argument('git_repo', nargs='?', default='.',
               help="if it's bare you need to provide a checkout_dir")
p.add_argument('checkout_dir', nargs='?')
p.add_argument('-v', '--verbose', action='count', default=0)
p.add_argument('-o', '--only', action='append')
args = p.parse_args()

# Translate fetch()'s truthiness into a conventional process exit status.
success = fetch(args.git_repo, args.checkout_dir, args.verbose, args.only)
sys.exit(0 if success else 1)
#!/usr/bin/env python
import sys
import enrichment as ea
from multiprocessing import Process
from __init__ import fetch


def _usage():
    print(
        "Corrects results from enrichment analysis for multiple comparison errors. Inserts q-values into results table."
    )
    print("Usage: python fdr_correction.py <GDS file or accn> <[MF, CC, BP]> <anno file 1> [more anno files...]")
    print("\n See README.md for more information.")
    sys.exit(1)


if __name__ == "__main__":
    # Expected argv: <GDS file or accn> <two-letter ontology code> <anno files...>
    if len(sys.argv) < 3 or len(sys.argv[2]) != 2:
        _usage()
    dataset = fetch(sys.argv[1], destdir="data")

    # Spawn one worker process per annotation file, then wait for all.
    jobs = []
    for year in sys.argv[3:]:
        worker = Process(target=ea.multitest_correction,
                         args=(dataset, sys.argv[2], [year]))
        jobs.append(worker)
        worker.start()
    for worker in jobs:
        worker.join()  # wait for them all to finish
Ejemplo n.º 4
0
def test():
    """Exercise the coda Python bindings end-to-end.

    Smoke-tests index helpers, NaN/Inf predicates, product open/close,
    cursor navigation, type introspection, expression evaluation, time
    conversions, and scalar/array/complex reads against several sample
    product files that must exist in the working directory
    ('madis-raob.nc', 'woef.nc', an AEOLUS .DBL and a MIPAS .N1 file).
    Results are printed rather than asserted, so correctness is judged
    by inspecting the output.
    """
    # index conversion
    print('index:', coda.c_index_to_fortran_index([4, 10], 9))

    # NaN, Inf constructors and predicates
    print(coda.NaN(), coda.MinInf(), coda.PlusInf())
    print(coda.isNaN(0), coda.isNaN(coda.NaN()))
    print(coda.isInf(0), coda.isInf(coda.PlusInf()), coda.isInf(coda.MinInf()))
    print(coda.isMinInf(0), coda.isMinInf(coda.MinInf()),
          coda.isMinInf(coda.PlusInf()))
    print(coda.isPlusInf(0), coda.isPlusInf(coda.PlusInf()),
          coda.isPlusInf(coda.MinInf()))

    # open product and dump its identifying attributes
    what = coda.recognize_file('madis-raob.nc')
    print('what:', what)
    product = coda.open('madis-raob.nc')

    print('class', coda.get_product_class(product))
    print('type', coda.get_product_type(product))
    print('version', coda.get_product_version(product))
    print('deffile', coda.get_product_definition_file(product))
    print('file size', coda.get_product_file_size(product))
    print('filename', coda.get_product_filename(product))
    print('format', coda.get_product_format(product))
    print('root type',
          coda.type_get_class(coda.get_product_root_type(product)))

    # fetch array
    array = coda.fetch(product, 'tpTropQCD')
    print(array)

    # fetch scalar
    scalar = coda.fetch(product, 'globalInventory')
    print(scalar)

    # read scalar int32 via a cursor
    cursor = coda.Cursor()
    coda.cursor_set_product(cursor, product)

    print('cursor has attrs', coda.cursor_has_attributes(cursor))
    type_ = coda.cursor_get_type(cursor)
    print('type has attrs', coda.type_has_attributes(type_))
    a = coda.type_get_attributes(type_)
    print('attrs type name', coda.type_get_class_name(coda.type_get_class(a)))

    coda.cursor_goto(cursor, 'globalInventory')

    gi = coda.cursor_read_int32(cursor)
    print('globalInventory:', gi)
    # reading with a mismatched type is expected to raise CodacError
    try:
        coda.cursor_read_uint16(cursor)
    except coda.CodacError as e:
        print(str(e))

    # integer expression evaluated relative to the cursor position
    coda.cursor_goto_root(cursor)
    expr = coda.expression_from_string('2 * int(./globalInventory)')
    print('expr:', coda.expression_eval_integer(expr, cursor))
    coda.expression_delete(expr)

    # read double array; first introspect the root record type
    coda.cursor_goto_root(cursor)
    type_ = coda.cursor_get_type(cursor)
    cl = coda.cursor_get_type_class(cursor)
    print('cl', cl)
    print('num recs:', coda.type_get_num_record_fields(type_))
    print('nt name:',
          coda.type_get_native_type_name(coda.type_get_read_type(type_)))
    print('0 hidden:', coda.type_get_record_field_hidden_status(type_, 0))
    print('0 available:',
          coda.type_get_record_field_available_status(type_, 0))
    print('field type:',
          coda.type_get_class(coda.type_get_record_field_type(type_, 0)))
    print('field name:', coda.type_get_record_field_name(type_, 0))
    print('field realname:', coda.type_get_record_field_real_name(type_, 0))
    print('index:',
          coda.type_get_record_field_index_from_name(type_, 'nStaticIds'))
    print('index:',
          coda.cursor_get_record_field_index_from_name(cursor, 'nStaticIds'))
    print('indexreal:',
          coda.type_get_record_field_index_from_real_name(type_, 'nStaticIds'))
    print('has attributes:', coda.type_has_attributes(type_))
    print('unit:', coda.type_get_unit(type_))
    print('union status:', coda.type_get_record_union_status(type_))
    print('union av i:', coda.cursor_get_available_union_field_index(cursor))
    print('product file',
          coda.get_product_filename(coda.cursor_get_product_file(cursor)))

    # record-field navigation by index and by name
    coda.cursor_goto_root(cursor)
    coda.cursor_goto_record_field_by_index(cursor, 0)
    coda.cursor_goto_root(cursor)
    coda.cursor_goto_record_field_by_name(cursor, 'nStaticIds')

    # goto / goto_parent round-trip
    coda.cursor_goto_root(cursor)
    coda.cursor_goto(cursor, 'tpTropQCD')
    coda.cursor_goto_parent(cursor)
    coda.cursor_goto(cursor, 'tpTropQCD')

    shape = coda.cursor_get_array_dim(cursor)
    print('shape:', shape)

    # array type introspection
    type_ = coda.cursor_get_type(cursor)
    print('numdims:', coda.type_get_array_num_dims(type_))
    print('dims:', coda.type_get_array_dim(type_))
    #print('attrs:', coda.type_get_attributes(type_))
    print('bitsize:', coda.type_get_bit_size(type_))
    print('class:', coda.type_get_class_name(coda.type_get_class(type_)))
    print('fixed:', coda.type_get_fixed_value(type_))
    print('format:', coda.type_get_format(type_))
    print('formatname:',
          coda.type_get_format_name(coda.type_get_format(type_)))
    print('name:', coda.type_get_name(type_))

    array = coda.cursor_read_double_array(cursor)
    print(array)

    #get specific element
    coda.cursor_goto_array_element(cursor, [0, 1, 7])
    print(coda.cursor_read_double(cursor))

    # read string
    coda.cursor_goto_root(cursor)
    coda.cursor_goto(cursor, 'staName')
    type_ = coda.cursor_get_type(cursor)
    print(coda.type_get_name(type_))
    print('dims:', coda.type_get_array_dim(type_))
    coda.cursor_goto_first_array_element(cursor)
    print(coda.cursor_get_string_length(cursor))
    print(repr(coda.cursor_read_string(cursor)))

    # read double partial array
    coda.cursor_goto_root(cursor)
    coda.cursor_goto(cursor, 'tpTropQCD')

    array = coda.cursor_read_double_partial_array(cursor, 10, 22)
    print(array.shape)
    print(array)

    # expected to fail: current node is not a union
    try:
        coda.cursor_goto_available_union_field(cursor)
    except coda.CodacError as e:
        print(str(e))

    # exceptions: each of the following should raise CodacError
    coda.cursor_goto_root(cursor)

    try:
        coda.cursor_goto(cursor, 'zzz')
    except coda.CodacError as e:
        print(str(e))

    try:
        coda.cursor_read_int32(cursor)
    except coda.CodacError as e:
        print(str(e))

    try:
        coda.open('pipo')
    except coda.CodacError as e:
        print(str(e))

    # version
    print(coda.version())

    # node expression: evaluating it moves the cursor deeper
    coda.cursor_goto_root(cursor)
    print('root depth:', coda.cursor_get_depth(cursor))
    expr = coda.expression_from_string('/globalInventory')
    coda.expression_eval_node(expr, cursor)
    print('expr depth:', coda.cursor_get_depth(cursor))
    coda.expression_delete(expr)

    # product class etc
    what = coda.recognize_file(
        'AE_TEST_ALD_U_N_1B_20190105T011602023_008364010_002143_0001.DBL')
    print('what:', what)

    # close
    coda.close(product)

    # reopen with an explicit class/type/version override
    product = coda.open_as(
        'AE_TEST_ALD_U_N_1B_20190105T011602023_008364010_002143_0001.DBL',
        'AEOLUS', 'ALD_U_N_1B', 9)

    print('class', coda.get_product_class(product))
    print('type', coda.get_product_type(product))
    print('version', coda.get_product_version(product))
    print('description', coda.get_description(product))
    print('attrs', coda.get_attributes(product))

    try:
        coda.get_product_variable_value(product, 'geolocation', 0)
    except coda.CodacError as e:
        print(str(e))

    # product/cursor methods: get_description accepts either argument kind
    cursor = coda.Cursor()
    coda.cursor_set_product(cursor, product)

    print('description', coda.get_description(product))
    print('description', coda.get_description(cursor))

    coda.cursor_goto(cursor, 'geolocation')
    coda.cursor_goto_array_element_by_index(cursor, 0)
    coda.cursor_goto(cursor, 'start_of_observation_time')

    # special (e.g. time) type introspection
    type_ = coda.cursor_get_type(cursor)
    print('T', coda.type_get_special_type(type_))
    print('N', coda.type_get_special_type_name(2))
    print(
        'B',
        coda.type_get_native_type_name(
            coda.type_get_read_type(coda.type_get_special_base_type(type_))))

    # drop down to the base type and read raw bytes/bits from the file
    coda.cursor_use_base_type_of_special_type(cursor)
    print('bitsize:', coda.cursor_get_bit_size(cursor))
    print('bytesize:', coda.cursor_get_byte_size(cursor))
    print('bitoff', coda.cursor_get_file_bit_offset(cursor))
    print('byteoff', coda.cursor_get_file_byte_offset(cursor))
    print('format', coda.cursor_get_format(cursor))
    data = coda.cursor_read_bytes(cursor, 0, 4)
    print(type(data), data.shape, data.dtype, data)
    data = coda.cursor_read_bits(cursor, 8, 40)
    print(type(data), data.shape, data.dtype, data)

    # expressions: constant integer expression, no cursor needed
    expr = coda.expression_from_string('1+2')
    print(coda.expression_is_constant(expr))
    print(coda.expression_is_equal(expr, expr))
    result = coda.expression_eval_integer(expr)
    print(result)
    type_ = coda.expression_get_type(expr)
    name = coda.expression_get_type_name(type_)
    print('type', type_, name)
    coda.expression_delete(expr)

    expr = coda.expression_from_string('4.5')
    print(coda.expression_eval_float(expr, cursor))
    coda.expression_delete(expr)

    expr = coda.expression_from_string('true')
    print(coda.expression_eval_bool(expr))
    coda.expression_delete(expr)

    expr = coda.expression_from_string('"bananen" + "vla"')
    print(coda.expression_eval_string(expr))
    coda.expression_delete(expr)

    # time conversions: double <-> parts <-> string, local and UTC
    parts = coda.time_double_to_parts(12345.67890)
    print(parts)
    parts_utc = coda.time_double_to_parts_utc(12345.67890)
    print(parts_utc)
    s = coda.time_double_to_string(12345.67890, 'yyyy-mm-dd')
    print(s)
    s_utc = coda.time_double_to_string_utc(12345.67890, 'yyyy-mm-dd')
    print(s_utc)

    d = coda.time_parts_to_double(*parts)
    print(d)
    d = coda.time_parts_to_double_utc(*parts_utc)
    print(d)
    s = coda.time_parts_to_string(*parts + ['yyyy-mm-dd'])
    print(s)

    d = coda.time_string_to_double('yyyy-mm-dd', s)
    print(d)
    d = coda.time_string_to_double_utc('yyyy-mm-dd', s)
    print(d)
    parts = coda.time_string_to_parts('yyyy-mm-dd', s)
    print(parts)

    # callback for match_filefilter (never invoked -- see note below)
    def findhelper(filepath, status, error):
        print('match?', filepath, status, error)

    # NOTE(review): match_filefilter call left disabled -- presumably
    # broken on the swig side; confirm before re-enabling.


#coda.match_filefilter('', ['/home/srepmub/coda/python/cffi/blup'], findhelper)

    print('index', coda.cursor_get_index(cursor))
    print('has_ascii', coda.cursor_has_ascii_content(cursor))

    #close
    coda.close(product)

    #test self-fabricated product
    product = coda.open('woef.nc')
    cursor = coda.Cursor()
    coda.cursor_set_product(cursor, product)

    # scalar char
    coda.cursor_goto(cursor, 'mychar')
    coda.cursor_goto_first_array_element(cursor)
    type_ = coda.cursor_get_type(cursor)
    print('len', coda.type_get_string_length(type_))
    print(repr(coda.cursor_read_string(cursor)))
    print(coda.fetch(product, 'mychar'))

    coda.close(product)

    #complex numbers
    product = coda.open(
        "MIP_NL__1PYDSI20120401_012202_000060153113_00161_52761_0000.N1")

    cursor = coda.Cursor()
    coda.cursor_set_product(cursor, product)

    print(coda.cursor_get_num_elements(cursor))
    type_ = coda.cursor_get_type(cursor)
    print(coda.type_get_class_name(coda.type_get_class(type_)))

    for i in range(14):
        print(i, coda.type_get_record_field_name(type_, i))

    coda.cursor_goto(cursor, 'mipas_level_1b_mds')
    coda.cursor_goto_array_element_by_index(cursor, 3)
    coda.cursor_goto_record_field_by_name(cursor, 'spike_amp')

    # three equivalent ways to read a complex array
    array = coda.cursor_read_complex_array(cursor)
    print(type(array), array.dtype, array.shape, array[50], type(array[50]))
    array = coda.cursor_read_complex_double_pairs_array(cursor)
    print(type(array), array.dtype, array.shape, array[50])
    array = coda.cursor_read_complex_double_split_array(cursor)
    print(type(array), len(array), len(array[0]), type(array[0]),
          array[0].dtype, array[0][50], array[1][50])

    # and three equivalent ways to read a single complex element
    coda.cursor_goto_array_element_by_index(cursor, 50)
    scalar = coda.cursor_read_complex(cursor)
    print(type(scalar), scalar)
    scalar = coda.cursor_read_complex_double_pair(cursor)
    print(type(scalar), scalar.dtype, scalar)
    scalar = coda.cursor_read_complex_double_split(cursor)
    print(type(scalar), scalar)

    coda.close(product)
Ejemplo n.º 5
0
def _load_json(path):
    """Parse the JSON document at *path*, closing the file handle promptly."""
    with open(path) as fh:
        return json.load(fh)


def main(file_or_accn, annotation_files, ontology):
    """Run the GO-term enrichment pipeline for one dataset.

    file_or_accn     -- GDS file path or accession, passed to fetch()
    annotation_files -- paths to per-year annotation JSON files
    ontology         -- subontology code used to restrict annotations

    Behavior is steered by module-level flags (FDR_CORRECTION,
    FILTER_SIMILAR, FILTER_BY_DEPTH, FILTER_BY_SIZE) and constants
    (NCORES, MIN_VARIANCE, MIN_DEPTH, MAX_DEPTH, ANNO_MIN_SIZE,
    ANNO_MAX_SIZE, MAPFILE).  Work is fanned out across NCORES
    subprocesses per annotation block.
    """
    # this file can be downloaded from Uniprot's mapping service;
    # _load_json closes the handle (the old json.load(open(...)) leaked it)
    uniprot2entrez_map = _load_json(MAPFILE)
    # sanity check that the mapping is complete, not a truncated download
    assert len(uniprot2entrez_map) > 27000

    # import the dataset and normalize it
    dataset = fetch(file_or_accn, destdir='data')
    dataset = dataset.to_numeric()
    dataset.filter().log2xform()

    # lazily import the annotation files (in JSON format)
    annotation_years = (_load_json(f) for f in annotation_files)

    # acquire the platform used from the dataset metadata
    platform = fetch(dataset.meta['platform'], destdir='data')

    print("Detected %d cores, splitting into %d subprocesses..."
        % (NCORES, NCORES))

    if FDR_CORRECTION:
        # one subprocess per annotation file; skip the enrichment pass
        jobs = []
        for annofile in annotation_files:
            p = Process(target=multitest_correction,
                       args=(dataset, ontology, [annofile]))
            jobs.append(p)
            p.start()
        for p in jobs:
            p.join()
        return

    for annotations in annotation_years:
        year = annotations['meta']['year']
        annos = annotations['anno']
        shuffled = annotations['meta'].get('shuffled', 0.0)
        # apply the optional annotation filters in sequence
        if FILTER_SIMILAR:
            print("Filtering out terms with less than a %d-gene "
                "difference from their parents" % MIN_VARIANCE)
            annos = filter_similar_terms(annos, MIN_VARIANCE)
        if FILTER_BY_DEPTH:
            print("Filtering out terms with fewer than %d "
                "or greater than %d parents" % (MIN_DEPTH, MAX_DEPTH))
            annos = filter_annos_by_depth(annos, MIN_DEPTH, MAX_DEPTH)
        if FILTER_BY_SIZE:
            print("Filtering out annotation gene sets greater than %d "
                "and less than %d" % (ANNO_MAX_SIZE, ANNO_MIN_SIZE))
            annos = filter_annos(annos, ANNO_MAX_SIZE, ANNO_MIN_SIZE)
        filtered_annotations = restrict_subontology(annos, ontology, year)
        blocks = split(filtered_annotations, blocks=NCORES)
        print("Split %d annotations into %d blocks of ~%d terms each..."
            % (len(filtered_annotations), len(blocks), len(blocks[0])))
        # We're only looking at one factor for this analysis
        # Iterate over factors if this is no longer true
        factor = 'disease state'
        for subset in dataset.factors[factor]:
            print("-- [year: %s] [dataset: %s] [%s: %s] --"
                % (year, dataset.id, factor, subset))
            jobs = []
            for block in blocks:
                p = Process(target=enriched,
                    args=(dataset, platform, factor, subset, block, year,
                        shuffled, len(filtered_annotations), ontology,
                        uniprot2entrez_map))
                jobs.append(p)
                p.start()
            for p in jobs:
                p.join()  # wait for them all to finish