Example #1
    def __iter__(self):

        # prefer implementation using xlutils.view as dates are automatically converted
        if self.use_view:
            try:
                import xlutils.view
            except ImportError as e:
                raise UnsatisfiedDependency(e, dep_message_utils)
            else:
                wb = xlutils.view.View(self.filename)
                if self.sheet is None:
                    ws = wb[0]
                else:
                    ws = wb[self.sheet]
                return (tuple(row) for row in ws)

        else:
            try:
                import xlrd
            except ImportError as e:
                raise UnsatisfiedDependency(e, dep_message)
            else:
                with xlrd.open_workbook(filename=self.filename,
                                        on_demand=True) as wb:
                    if self.sheet is None:
                        ws = wb.sheet_by_index(0)
                    elif isinstance(self.sheet, int):
                        ws = wb.sheet_by_index(self.sheet)
                    else:
                        ws = wb.sheet_by_name(str(self.sheet))
                    return (tuple(ws.row_values(rownum))
                            for rownum in range(ws.nrows))
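
A minimal usage sketch, assuming this __iter__ lives on the table container returned by petlx's fromxls; the filename and sheet name are hypothetical:

from petlx.xls import fromxls

tbl = fromxls('example.xls', sheet='Sheet1')
for row in tbl:
    print(row)  # each row comes back as a tuple of cell values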
Example #2
def tupletrees(table, facet, start='start', stop='stop', value=None):
    """
    Construct faceted interval trees for the given table, where each node in the tree is a row of the table.

    """

    try:
        import bx.intervals
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)

    it = iter(table)
    fields = it.next()
    assert start in fields, 'start field not recognised'
    assert stop in fields, 'stop field not recognised'
    getstart = itemgetter(fields.index(start))
    getstop = itemgetter(fields.index(stop))
    if value is None:
        getvalue = tuple
    else:
        valueindices = asindices(fields, value)
        assert len(valueindices) > 0, 'invalid value field specification'
        getvalue = itemgetter(*valueindices)
    keyindices = asindices(fields, facet)
    assert len(keyindices) > 0, 'invalid key'
    getkey = itemgetter(*keyindices)

    trees = dict()
    for row in it:
        k = getkey(row)
        if k not in trees:
            trees[k] = bx.intervals.intersection.IntervalTree()
        trees[k].add(getstart(row), getstop(row), getvalue(row))
    return trees
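
A hypothetical usage sketch: facet on a 'chrom' field, then query one of the resulting bx-python trees with IntervalTree.find(start, stop); the table data is illustrative:

tbl = [('chrom', 'start', 'stop', 'name'),
       ('chr1', 10, 20, 'a'),
       ('chr1', 15, 30, 'b'),
       ('chr2', 5, 12, 'c')]
trees = tupletrees(tbl, 'chrom')
hits = trees['chr1'].find(12, 18)  # row tuples overlapping the query interval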
Example #3
    def __iter__(self):
        try:
            import vcf as pyvcf
        except ImportError as e:
            raise UnsatisfiedDependency(e, dep_message)
        reader = pyvcf.Reader(filename=self.filename)

        # determine header
        if isinstance(self.samples, (list, tuple)):
            # specific samples requested
            yield fixed_fields + tuple(self.samples)
        elif self.samples:
            # all samples
            yield fixed_fields + tuple(reader.samples)
        else:
            # no samples
            yield fixed_fields

        # fetch region?
        if None not in {self.chrom, self.start}:
            it = reader.fetch(self.chrom, self.start, self.end)
        else:
            it = reader

        # yield data
        for rec in it:
            out = tuple(getattr(rec, f) for f in fixed_fields)
            if isinstance(self.samples, (list, tuple)):
                # specific samples requested
                out += tuple(rec.genotype(s) for s in self.samples)
            elif self.samples:
                # all samples
                out += tuple(rec.samples)
            yield out
Example #4
def todataframe(table,
                index=None,
                exclude=None,
                columns=None,
                coerce_float=False,
                nrows=None):
    """
    Convenience function to load data from the given `table` into a pandas DataFrame.

    .. versionadded:: 0.14

    """
    try:
        import pandas as pd
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)
    else:
        l = list(table)
        data = l[1:]
        if columns is None:
            columns = l[0]
        return pd.DataFrame.from_records(data,
                                         index=index,
                                         exclude=exclude,
                                         columns=columns,
                                         coerce_float=coerce_float,
                                         nrows=nrows)
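
A minimal sketch, assuming a petl-style table with the header as the first row:

tbl = [('foo', 'bar'), ('a', 1), ('b', 2)]
df = todataframe(tbl)
# df is a pandas DataFrame with columns 'foo' and 'bar' and two rows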
Example #5
def display(tbl, limit=None, **kwargs):
    """
    Display a table inline within an IPython notebook. E.g.::
    
        In [0]: from petlx.ipython import display
                tbl = [['foo', 'bar'], ['a', 1], ['b', 2]]
                display(tbl)
                
    Alternatively, using the fluent style::
    
        In [0]: import petl.interactive as etl
                import petlx.ipython
                tbl = etl.wrap([['foo', 'bar'], ['a', 1], ['b', 2]])
                tbl.display()
                
    .. versionadded:: 0.5  
    
    """

    try:
        from IPython.core.display import display_html
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)
    else:
        html = repr_html(tbl, limit=limit, **kwargs)
        display_html(html, raw=True)
Example #6
def _get_hdf5_table(source, where, name, mode='r'):

    try:
        import tables
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)

    # allow for polymorphic args
    if isinstance(source, tables.Table):
        h5file = None
        h5tbl = source
    else:
        if isinstance(source, basestring):
            # assume it's the name of an HDF5 file
            h5file = tables.openFile(source, mode=mode)
        elif isinstance(source, tables.File):
            h5file = source
        else:
            raise Exception(
                'invalid source argument, expected file name or tables.File or tables.Table object, found: %r'
                % source)
        h5tbl = h5file.getNode(where, name=name)
        assert isinstance(h5tbl,
                          tables.Table), 'node is not a table: %r' % h5tbl
    return h5file, h5tbl
Example #7
def unpackcall(tbl, *keys, **kwargs):
    """
    Unpack the call column. E.g.::
    
        >>> from petlx.vcf import fromvcf, unpackinfo, meltsamples, unpackcall
        >>> from petl import look, cutout
        >>> t1 = fromvcf('../fixture/sample.vcf')
        >>> t2 = meltsamples(t1)
        >>> t3 = unpackcall(t2)
        >>> t4 = cutout(t3, 'INFO')
        >>> look(t4)
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | 'CHROM' | 'POS' | 'ID'        | 'REF' | 'ALT' | 'QUAL' | 'FILTER' | 'SAMPLE'  | 'GT'  | 'GQ' | 'DP' | 'HQ'         |
        +=========+=======+=============+=======+=======+========+==========+===========+=======+======+======+==============+
        | '19'    |   111 | None        | 'A'   | [C]   |    9.6 | []       | 'NA00001' | '0|0' | None | None | [10, 10]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '19'    |   111 | None        | 'A'   | [C]   |    9.6 | []       | 'NA00002' | '0|0' | None | None | [10, 10]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '19'    |   111 | None        | 'A'   | [C]   |    9.6 | []       | 'NA00003' | '0/1' | None | None | [3, 3]       |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '19'    |   112 | None        | 'A'   | [G]   |     10 | []       | 'NA00001' | '0|0' | None | None | [10, 10]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '19'    |   112 | None        | 'A'   | [G]   |     10 | []       | 'NA00002' | '0|0' | None | None | [10, 10]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '19'    |   112 | None        | 'A'   | [G]   |     10 | []       | 'NA00003' | '0/1' | None | None | [3, 3]       |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '20'    | 14370 | 'rs6054257' | 'G'   | [A]   |     29 | []       | 'NA00001' | '0|0' |   48 |    1 | [51, 51]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '20'    | 14370 | 'rs6054257' | 'G'   | [A]   |     29 | []       | 'NA00002' | '1|0' |   48 |    8 | [51, 51]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '20'    | 14370 | 'rs6054257' | 'G'   | [A]   |     29 | []       | 'NA00003' | '1/1' |   43 |    5 | [None, None] |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '20'    | 17330 | None        | 'T'   | [A]   |      3 | ['q10']  | 'NA00001' | '0|0' |   49 |    3 | [58, 50]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        
    .. versionadded:: 0.5
    
    """
    if not keys:
        if hasattr(tbl, 'filename'):
            try:
                import vcf as pyvcf
            except ImportError as e:
                raise UnsatisfiedDependency(e, dep_message)
            reader = pyvcf.Reader(filename=tbl.filename)
            # all FORMAT
            keys = reader.formats.keys()
        else:
            # enable sampling of keys from data
            tbl = convert(tbl, 'CALL', lambda v: v.data._asdict())
    result = unpackdict(tbl, 'CALL', keys=keys)
    if 'prefix' in kwargs:
        result = rename(result, {k: kwargs['prefix'] + k for k in keys})
    if hasattr(tbl, 'filename'):
        return VCFWrapper(result, tbl.filename)
    else:
        return result
Example #8
    def __init__(self, tree=None, proximity=0):
        try:
            import bx.intervals
        except ImportError as e:
            raise UnsatisfiedDependency(e, dep_message)
        if tree is None:
            self.tree = bx.intervals.intersection.IntervalTree()
        else:
            self.tree = tree
        self.proximity = proximity
Example #9
def torecarray(*args, **kwargs):
    """
    Convenient shorthand for ``toarray(...).view(np.recarray)``.
    
    .. versionadded:: 0.5.1
    
    """
    try:
        import numpy as np
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)
    else:
        return toarray(*args, **kwargs).view(np.recarray)
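
A minimal sketch: the recarray view exposes the table's fields as attributes.

tbl = [('foo', 'bar'), ('a', 1), ('b', 2)]
ra = torecarray(tbl)
ra.bar  # -> array([1, 2]), equivalent to ra['bar']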
Example #10
def guessdtype(table):
    try:
        import numpy as np
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)
    else:
        # get numpy to infer dtype
        it = iter(table)
        fields = it.next()
        rows = tuple(it)
        dtype = np.rec.array(rows).dtype
        dtype.names = fields
        return dtype
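
A minimal sketch of the inference, assuming a petl-style table of tuples:

tbl = [('foo', 'bar'), ('apples', 1), ('pears', 2)]
dt = guessdtype(tbl)
# dt is a numpy record dtype along the lines of [('foo', 'S6'), ('bar', '<i8')]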
Example #11
def iterindex(index_or_dirname, indexname, docnum_field):
    try:
        import whoosh
        import whoosh.index
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)
    else:

        if isinstance(index_or_dirname, basestring):
            dirname = index_or_dirname
            index = whoosh.index.open_dir(dirname,
                                          indexname=indexname,
                                          readonly=True)
            needs_closing = True
        elif isinstance(index_or_dirname, whoosh.index.Index):
            index = index_or_dirname
            needs_closing = False
        else:
            raise Exception('expected string or index, found %r' %
                            index_or_dirname)

        try:

            if docnum_field is None:

                # figure out the field names
                fields = tuple(index.schema.stored_names())
                yield fields

                # yield all documents
                astuple = operator.itemgetter(*index.schema.stored_names())
                for _, stored_fields_dict in index.reader().iter_docs():
                    yield astuple(stored_fields_dict)

            else:

                # figure out the field names
                fields = (docnum_field, ) + tuple(index.schema.stored_names())
                yield fields

                # yield all documents
                astuple = operator.itemgetter(*index.schema.stored_names())
                for docnum, stored_fields_dict in index.reader().iter_docs():
                    yield (docnum, ) + astuple(stored_fields_dict)

        except:
            raise

        finally:
            if needs_closing:
                # close the index if we're the ones who opened it
                index.close()
Example #12
def valuestoarray(vals, dtype=None, count=-1, sample=1000):

    try:
        import numpy as np
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)
    else:

        it = iter(vals)

        if dtype is None:
            peek, it = iterpeek(it, sample)
            dtype = np.array(peek).dtype

        a = np.fromiter(it, dtype=dtype, count=count)
        return a
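
A minimal sketch: with no dtype given, up to `sample` values are peeked at to infer one.

a = valuestoarray([1, 2, 3])
# a -> array([1, 2, 3]); pass dtype explicitly to skip the sampling step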
Example #13
def toxlsx(tbl, filename, sheet=None, encoding='utf-8'):
    """
    Write a table to a new Excel (.xlsx) file.

    .. versionadded:: 0.15

    """

    try:
        import openpyxl
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)
    wb = openpyxl.Workbook(optimized_write=True, encoding=encoding)
    ws = wb.create_sheet(title=sheet)
    for row in tbl:
        ws.append(row)
    wb.save(filename)
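
A minimal sketch; the output filename and sheet title are hypothetical:

tbl = [('foo', 'bar'), ('a', 1), ('b', 2)]
toxlsx(tbl, 'example.xlsx', sheet='data')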
Example #14
def toxls(tbl,
          filename,
          sheet,
          encoding='ascii',
          style_compression=0,
          styles=None):
    """
    Write a table to a new Excel (.xls) file.

    .. versionadded:: 0.15

    """

    try:
        import xlwt
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message_write)
    else:
        wb = xlwt.Workbook(encoding=encoding,
                           style_compression=style_compression)
        ws = wb.add_sheet(sheet)

        if styles is None:
            # simple version, don't worry about styles
            for r, row in enumerate(tbl):
                for c, label in enumerate(row):
                    ws.write(r, c, label=label)
        else:
            # handle styles
            it = iter(tbl)
            fields = it.next()
            for c, label in enumerate(fields):
                ws.write(0, c, label=label)
                if label not in styles:
                    styles[label] = xlwt.Style.default_style
            # convert to list for easy zipping
            styles = [styles[f] for f in fields]
            for r, row in enumerate(it):
                for c, (label, style) in enumerate(
                        izip_longest(row, styles, fillvalue=None)):
                    if style is None:
                        style = xlwt.Style.default_style
                    ws.write(r + 1, c, label=label, style=style)

        wb.save(filename)
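
A minimal sketch with per-field styles; xlwt.easyxf builds a style object from a specification string, and the number format here is illustrative:

import xlwt

tbl = [('foo', 'bar'), ('a', 1.0), ('b', 2.5)]
styles = {'bar': xlwt.easyxf(num_format_str='0.00')}
toxls(tbl, 'example.xls', 'data', styles=styles)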
Example #15
def recordtree(table, start='start', stop='stop'):
    """
    Construct an interval tree for the given table, where each node in the tree is a row of the table represented
    as a hybrid tuple/dictionary-style record object.

    """

    try:
        import bx.intervals
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)

    getstart = attrgetter(start)
    getstop = attrgetter(stop)

    tree = bx.intervals.intersection.IntervalTree()
    for rec in records(table):
        tree.add(getstart(rec), getstop(rec), rec)
    return tree
Example #16
    def __iter__(self):
        try:
            from pysam import Tabixfile, asTuple
        except ImportError as e:
            raise UnsatisfiedDependency(e, dep_message)
        f = Tabixfile(self.filename, mode='r')
        try:
            # header row
            if self.header is not None:
                yield self.header
            else:
                # assume last header line has fields
                h = list(f.header)
                if len(h) > 0:
                    yield tuple(h[-1].split('\t'))
            # data rows
            for row in f.fetch(reference=self.reference, start=self.start,
                               end=self.end, region=self.region,
                               parser=asTuple()):
                yield tuple(row)
        except:
            raise
        finally:
            f.close()
Example #17
def make_sqlalchemy_table(table, tablename, schema=None, constraints=True, metadata=None):
    """
    Create an SQLAlchemy table based on a :mod:`petl` table.

    Parameters
    ----------

    table : sequence of sequences (petl table)
        Table data to use to infer types etc.
    tablename : string
        Name of the table
    schema : string
        Name of the database schema to create the table in
    constraints : bool
        If True use length and nullable constraints
    metadata : sqlalchemy.MetaData
        Custom table metadata

    """

    try:
        import sqlalchemy
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)

    if not metadata:
        metadata = sqlalchemy.MetaData()

    sql_table = sqlalchemy.Table(tablename, metadata, schema=schema)

    fields = header(table)
    cols = columns(table)

    for f in fields:
        sql_column = make_sqlalchemy_column(cols[f], f, constraints=constraints)
        sql_table.append_column(sql_column)

    return sql_table
Example #18
    def __iter__(self):
        try:
            import openpyxl
        except ImportError as e:
            raise UnsatisfiedDependency(e, dep_message)

        use_iterators = self.range is None
        wb = openpyxl.load_workbook(filename=self.filename,
                                    use_iterators=use_iterators,
                                    **self.kwargs)
        if self.sheet is None:
            ws = wb.get_sheet_by_name(wb.get_sheet_names()[0])
        elif isinstance(self.sheet, int):
            ws = wb.get_sheet_by_name(wb.get_sheet_names()[self.sheet])
        else:
            ws = wb.get_sheet_by_name(str(self.sheet))

        if self.range is not None:
            return (tuple(cell.value for cell in row)
                    for row in ws.range(self.range))
        else:
            return (tuple(cell.value for cell in row)
                    for row in ws.iter_rows())
Example #19
def recordtrees(table, facet, start='start', stop='stop'):
    """
    Construct faceted interval trees for the given table, where each node in the tree is a row of the table represented
    as a hybrid tuple/dictionary-style record object.

    """

    try:
        import bx.intervals
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)

    getstart = attrgetter(start)
    getstop = attrgetter(stop)
    getkey = attrgetter(facet)

    trees = dict()
    for rec in records(table):
        k = getkey(rec)
        if k not in trees:
            trees[k] = bx.intervals.intersection.IntervalTree()
        trees[k].add(getstart(rec), getstop(rec), rec)
    return trees
Example #20
def make_create_table_statement(table, tablename, schema=None, constraints=True, metadata=None, dialect=None):
    """
    Generate a CREATE TABLE statement based on a :mod:`petl` table.

    Parameters
    ----------

    table : sequence of sequences (petl table)
        Table data to use to infer types etc.
    tablename : string
        Name of the table
    schema : string
        Name of the database schema to create the table in
    constraints : bool
        If True use length and nullable constraints
    metadata : sqlalchemy.MetaData
        Custom table metadata
    dialect : string
        One of {'access', 'sybase', 'sqlite', 'informix', 'firebird', 'mysql', 'oracle', 'maxdb', 'postgresql', 'mssql'}

    """

    try:
        import sqlalchemy
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)

    sql_table = make_sqlalchemy_table(table, tablename, schema=schema, constraints=constraints, metadata=metadata)

    if dialect:
        module = __import__('sqlalchemy.dialects.%s' % DIALECTS[dialect], fromlist=['dialect'])
        sql_dialect = module.dialect()
    else:
        sql_dialect = None

    return unicode(sqlalchemy.schema.CreateTable(sql_table).compile(dialect=sql_dialect)).strip() + ';'
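
A minimal sketch, generating DDL for a hypothetical table:

tbl = [('foo', 'bar'), ('a', 1), ('b', 2)]
print(make_create_table_statement(tbl, 'example', dialect='postgresql'))
# prints something like:
# CREATE TABLE example (foo VARCHAR(1) NOT NULL, bar INTEGER NOT NULL);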
Example #21
def make_sqlalchemy_column(col, colname, constraints=True):
    """
    Infer an appropriate SQLAlchemy column type based on a sequence of values.

    Parameters
    ----------

    col : sequence
        A sequence of values to use to infer type, length etc.
    colname : string
        Name of column
    constraints : bool
        If True use length and nullable constraints

    """

    try:
        import sqlalchemy
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)

    col_not_none = [v for v in col if v is not None]
    sql_column_kwargs = {}
    sql_type_kwargs = {}

    if len(col_not_none) == 0:
        sql_column_type = sqlalchemy.String
        if constraints:
            sql_type_kwargs['length'] = NULL_COLUMN_MAX_LENGTH

    elif all(isinstance(v, bool) for v in col_not_none):
        sql_column_type = sqlalchemy.Boolean

    elif all(isinstance(v, int) for v in col_not_none):
        if max(col_not_none) > SQL_INTEGER_MAX or min(col_not_none) < SQL_INTEGER_MIN:
            sql_column_type = sqlalchemy.BigInteger
        else:
            sql_column_type = sqlalchemy.Integer

    elif all(isinstance(v, long) for v in col_not_none):
        sql_column_type = sqlalchemy.BigInteger

    elif all(isinstance(v, (int, long)) for v in col_not_none):
        sql_column_type = sqlalchemy.BigInteger

    elif all(isinstance(v, (int, long, float)) for v in col_not_none):
        sql_column_type = sqlalchemy.Float

    elif all(isinstance(v, datetime.date) for v in col_not_none):
        sql_column_type = sqlalchemy.Date

    elif all(isinstance(v, datetime.time) for v in col_not_none):
        sql_column_type = sqlalchemy.Time

    elif all(isinstance(v, datetime.datetime) for v in col_not_none):
        sql_column_type = sqlalchemy.DateTime

    else:
        sql_column_type = sqlalchemy.String
        if constraints:
            sql_type_kwargs['length'] = max([len(unicode(v)) for v in col])

    if constraints:
        sql_column_kwargs['nullable'] = len(col_not_none) < len(col)

    return sqlalchemy.Column(colname, sql_column_type(**sql_type_kwargs), **sql_column_kwargs)
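
A minimal sketch of the type inference, assuming sqlalchemy is importable:

col = [1, 2, None]
c = make_sqlalchemy_column(col, 'n')
# c is an Integer column; the None value makes it nullable when constraints=True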
Example #22
def tohdf5(table,
           source,
           where=None,
           name=None,
           create=False,
           description=None,
           title='',
           filters=None,
           expectedrows=10000,
           chunkshape=None,
           byteorder=None,
           createparents=False,
           sample=1000):
    """
    Write to an HDF5 table. If `create` is `False`, assumes the table
    already exists, and attempts to truncate it before loading. If `create`
    is `True`, any existing table is dropped, and a new table is created;
    if `description` is None, the datatype will be guessed. E.g.::
    
        >>> from petl import look
        >>> look(table1)
        +-------+----------+
        | 'foo' | 'bar'    |
        +=======+==========+
        | 1     | 'asdfgh' |
        +-------+----------+
        | 2     | 'qwerty' |
        +-------+----------+
        | 3     | 'zxcvbn' |
        +-------+----------+
        
        >>> from petlx.hdf5 import tohdf5, fromhdf5
        >>> tohdf5(table1, 'test1.h5', '/testgroup', 'testtable', create=True, createparents=True)
        >>> look(fromhdf5('test1.h5', '/testgroup', 'testtable'))
        +-------+----------+
        | 'foo' | 'bar'    |
        +=======+==========+
        | 1     | 'asdfgh' |
        +-------+----------+
        | 2     | 'qwerty' |
        +-------+----------+
        | 3     | 'zxcvbn' |
        +-------+----------+

    See also :func:`appendhdf5`.
    
    .. versionadded:: 0.3
    
    """

    it = iter(table)

    if create:

        try:
            import tables
        except ImportError as e:
            raise UnsatisfiedDependency(e, dep_message)

        if isinstance(source, basestring):
            # assume it's the name of an HDF5 file
            h5file = tables.openFile(source,
                                     mode='a')  # don't replace the whole file!
        elif isinstance(source, tables.File):
            h5file = source
        else:
            raise Exception(
                'invalid source argument, expected file name or tables.File, found: %r'
                % source)

        # determine datatype
        if description is None:
            peek, it = iterpeek(it, sample)
            # use a numpy dtype
            description = guessdtype(peek)

        # check if the table node already exists
        try:
            h5table = h5file.getNode(where, name)
        except tables.NoSuchNodeError:
            pass
        else:
            # drop the node
            h5file.removeNode(where, name)

        # create the table
        h5table = h5file.createTable(where,
                                     name,
                                     description,
                                     title=title,
                                     filters=filters,
                                     expectedrows=expectedrows,
                                     chunkshape=chunkshape,
                                     byteorder=byteorder,
                                     createparents=createparents)

    else:
        h5file, h5table = _get_hdf5_table(source, where, name, mode='a')

    try:
        # truncate the existing table
        h5table.truncate(0)

        # load the data
        _insert(it, h5table)

    finally:
        if isinstance(source, basestring):
            # close the file if we opened it here
            h5file.close()
Example #23
def toindex(tbl,
            index_or_dirname,
            schema=None,
            indexname=None,
            merge=False,
            optimize=False):
    """
    Load all rows from `tbl` into a Whoosh index. N.B., this will clear any
    existing data in the index before loading. E.g.::

        >>> from petl import look
        >>> from petlx.index import toindex, fromindex
        >>> # here is the table we want to load into an index
        ... look(tbl)
        +--------+------+------+-------+--------------------------------------------------+
        | 'f0'   | 'f1' | 'f2' | 'f3'  | 'f4'                                             |
        +========+======+======+=======+==================================================+
        | u'AAA' |   12 |  4.3 | True  | datetime.datetime(2014, 6, 30, 14, 7, 2, 333199) |
        +--------+------+------+-------+--------------------------------------------------+
        | u'BBB' |    6 |  3.4 | False | datetime.datetime(1900, 1, 31, 0, 0)             |
        +--------+------+------+-------+--------------------------------------------------+
        | u'CCC' |   42 |  7.8 | True  | datetime.datetime(2100, 12, 25, 0, 0)            |
        +--------+------+------+-------+--------------------------------------------------+

        >>> # define a schema for the index
        ... from whoosh.fields import *
        >>> schema = Schema(f0=TEXT(stored=True),
        ...                 f1=NUMERIC(int, stored=True),
        ...                 f2=NUMERIC(float, stored=True),
        ...                 f3=BOOLEAN(stored=True),
        ...                 f4=DATETIME(stored=True))
        >>> # load data
        ... toindex(tbl, 'tmp/example', schema=schema)
        >>> # look what it did
        ... look(fromindex('tmp/example'))
        +--------+------+------+-------+--------------------------------------------------+
        | 'f0'   | 'f1' | 'f2' | 'f3'  | 'f4'                                             |
        +========+======+======+=======+==================================================+
        | u'AAA' |   12 |  4.3 | True  | datetime.datetime(2014, 6, 30, 14, 7, 2, 333199) |
        +--------+------+------+-------+--------------------------------------------------+
        | u'BBB' |    6 |  3.4 | False | datetime.datetime(1900, 1, 31, 0, 0)             |
        +--------+------+------+-------+--------------------------------------------------+
        | u'CCC' |   42 |  7.8 | True  | datetime.datetime(2100, 12, 25, 0, 0)            |
        +--------+------+------+-------+--------------------------------------------------+

    .. versionadded:: 0.16

    Parameters
    ----------

    tbl
        A table-like object (row container) containing the data to be loaded.
    index_or_dirname
        Either an instance of `whoosh.index.Index` or a string containing the
        directory path where the index is to be stored.
    indexname
        String containing the name of the index, if multiple indexes are stored
        in the same directory.
    merge
        Merge small segments during commit?
    optimize
        Merge all segments together?

    """
    try:
        import whoosh
        import whoosh.index
        import whoosh.writing
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)
    else:

        # deal with polymorphic argument
        if isinstance(index_or_dirname, basestring):
            dirname = index_or_dirname
            index = whoosh.index.create_in(dirname,
                                           schema,
                                           indexname=indexname)
            needs_closing = True
        elif isinstance(index_or_dirname, whoosh.index.Index):
            index = index_or_dirname
            needs_closing = False
        else:
            raise Exception('expected string or index, found %r' %
                            index_or_dirname)

        writer = index.writer()
        try:

            for d in dicts(tbl):
                writer.add_document(**d)
            writer.commit(merge=merge,
                          optimize=optimize,
                          mergetype=whoosh.writing.CLEAR)

        except:
            writer.cancel()
            raise

        finally:
            if needs_closing:
                index.close()
Example #24
def appendindex(tbl,
                index_or_dirname,
                indexname=None,
                merge=True,
                optimize=False):
    """
    Load all rows from `tbl` into a Whoosh index, adding them to any existing
    data in the index.

    .. versionadded:: 0.16

    Parameters
    ----------

    tbl
        A table-like object (row container) containing the data to be loaded.
    index_or_dirname
        Either an instance of `whoosh.index.Index` or a string containing the
        directory path where the index is to be stored.
    indexname
        String containing the name of the index, if multiple indexes are stored
        in the same directory.
    merge
        Merge small segments during commit?
    optimize
        Merge all segments together?

    """
    try:
        import whoosh
        import whoosh.index
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)
    else:

        # deal with polymorphic argument
        if isinstance(index_or_dirname, basestring):
            dirname = index_or_dirname
            index = whoosh.index.open_dir(dirname,
                                          indexname=indexname,
                                          readonly=False)
            needs_closing = True
        elif isinstance(index_or_dirname, whoosh.index.Index):
            index = index_or_dirname
            needs_closing = False
        else:
            raise Exception('expected string or index, found %r' %
                            index_or_dirname)

        writer = index.writer()
        try:

            for d in dicts(tbl):
                writer.add_document(**d)
            writer.commit(merge=merge, optimize=optimize)

        except Exception:
            writer.cancel()
            raise

        finally:
            if needs_closing:
                index.close()
Example #25
def toarray(table, dtype=None, count=-1, sample=1000):
    """
    Convenience function to load data from the given `table` into a numpy 
    structured array. E.g.::

        >>> from petl import look
        >>> from petlx.array import toarray
        >>> look(table)
        +-----------+-------+-------+
        | 'foo'     | 'bar' | 'baz' |
        +===========+=======+=======+
        | 'apples'  | 1     | 2.5   |
        +-----------+-------+-------+
        | 'oranges' | 3     | 4.4   |
        +-----------+-------+-------+
        | 'pears'   | 7     | 0.1   |
        +-----------+-------+-------+
        
        >>> a = toarray(table)
        >>> a
        array([('apples', 1, 2.5), ('oranges', 3, 4.4), ('pears', 7, 0.1)], 
              dtype=[('foo', '|S7'), ('bar', '<i8'), ('baz', '<f8')])
        >>> a['foo']
        array(['apples', 'oranges', 'pears'], 
              dtype='|S7')
        >>> a['bar']
        array([1, 3, 7])
        >>> a['baz']
        array([ 2.5,  4.4,  0.1])
        >>> a['foo'][0]
        'apples'
        >>> a['bar'][1]
        3
        >>> a['baz'][2]
        0.10000000000000001
        
    If no datatype is specified, `sample` rows will be examined to infer an
    appropriate datatype for each field.
        
    The datatype can be specified as a string, e.g.:

        >>> a = toarray(table, dtype='a4, i2, f4')
        >>> a
        array([('appl', 1, 2.5), ('oran', 3, 4.400000095367432),
               ('pear', 7, 0.10000000149011612)], 
              dtype=[('foo', '|S4'), ('bar', '<i2'), ('baz', '<f4')])

    The datatype can also be partially specified, in which case datatypes will
    be inferred for other fields, e.g.:
    
        >>> a = toarray(table, dtype={'foo': 'a4'})
        >>> a
        array([('appl', 1, 2.5), ('oran', 3, 4.4), ('pear', 7, 0.1)], 
              dtype=[('foo', '|S4'), ('bar', '<i8'), ('baz', '<f8')])
    
    """

    try:
        import numpy as np
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)
    else:

        it = iter(table)
        peek, it = iterpeek(it, sample)
        fields = it.next()

        if dtype is None:
            dtype = guessdtype(peek)

        elif isinstance(dtype, basestring):
            # insert field names from source table
            typestrings = [s.strip() for s in dtype.split(',')]
            dtype = [(f, t) for f, t in zip(fields, typestrings)]

        elif (isinstance(dtype, dict)
              and ('names' not in dtype or 'formats' not in dtype)):
            # allow for partial specification of dtype
            cols = columns(peek)
            newdtype = {'names': [], 'formats': []}
            for f in fields:
                newdtype['names'].append(f)
                if f in dtype and isinstance(dtype[f], tuple):
                    # assume fully specified
                    newdtype['formats'].append(dtype[f][0])
                elif f not in dtype:
                    # not specified at all
                    a = np.array(cols[f])
                    newdtype['formats'].append(a.dtype)
                else:
                    # assume directly specified, just need to add offset
                    newdtype['formats'].append(dtype[f])
            dtype = newdtype

        else:
            pass  # leave dtype as-is

        # numpy is fussy about having tuples, need to make sure
        it = (tuple(row) for row in it)
        sa = np.fromiter(it, dtype=dtype, count=count)

        return sa
Example #26
def itersearchindex(index_or_dirname, query, limit, pagenum, pagelen,
                    indexname, docnum_field, score_field, fieldboosts,
                    search_kwargs):
    try:
        import whoosh
        import whoosh.index
        import whoosh.query
        import whoosh.qparser
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)
    else:

        if isinstance(index_or_dirname, basestring):
            dirname = index_or_dirname
            index = whoosh.index.open_dir(dirname,
                                          indexname=indexname,
                                          readonly=True)
            needs_closing = True
        elif isinstance(index_or_dirname, whoosh.index.Index):
            index = index_or_dirname
            needs_closing = False
        else:
            raise Exception('expected string or index, found %r' %
                            index_or_dirname)

        try:

            # figure out header
            fields = tuple()
            if docnum_field is not None:
                fields += (docnum_field, )
            if score_field is not None:
                fields += (score_field, )
            stored_names = tuple(index.schema.stored_names())
            fields += stored_names
            yield fields

            # parse the query
            if isinstance(query, basestring):
                # search all fields by default
                parser = whoosh.qparser.MultifieldParser(
                    index.schema.names(),
                    index.schema,
                    fieldboosts=fieldboosts)
                query = parser.parse(query)
            elif isinstance(query, whoosh.query.Query):
                pass
            else:
                raise Exception(
                    'expected string or whoosh.query.Query, found %r' % query)

            # make a function to turn docs into tuples
            astuple = operator.itemgetter(*index.schema.stored_names())

            with index.searcher() as searcher:
                if limit is not None:
                    results = searcher.search(query,
                                              limit=limit,
                                              **search_kwargs)
                else:
                    results = searcher.search_page(query,
                                                   pagenum,
                                                   pagelen=pagelen,
                                                   **search_kwargs)

                if docnum_field is None and score_field is None:

                    for doc in results:
                        yield astuple(doc)

                else:

                    for (docnum, score), doc in itertools.izip(
                            results.items(), results):
                        row = tuple()
                        if docnum_field is not None:
                            row += (docnum, )
                        if score_field is not None:
                            row += (score, )
                        row += astuple(doc)
                        yield row

        except:
            raise

        finally:
            if needs_closing:
                # close the index if we're the ones who opened it
                index.close()
Example #27
def unpackinfo(tbl, *keys, **kwargs):
    """
    Unpack the INFO field into separate fields. E.g.::

        >>> from petlx.vcf import fromvcf, unpackinfo
        >>> from petl import look
        >>> t1 = fromvcf('../fixture/sample.vcf', samples=False)
        >>> look(t1)
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        | 'CHROM' | 'POS'   | 'ID'        | 'REF' | 'ALT'     | 'QUAL' | 'FILTER' | 'INFO'                                                                                  |
        +=========+=========+=============+=======+===========+========+==========+=========================================================================================+
        | '19'    |     111 | None        | 'A'   | [C]       |    9.6 | []       | {}                                                                                      |
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        | '19'    |     112 | None        | 'A'   | [G]       |     10 | []       | {}                                                                                      |
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        | '20'    |   14370 | 'rs6054257' | 'G'   | [A]       |     29 | []       | OrderedDict([('NS', 3), ('DP', 14), ('AF', [0.5]), ('DB', True), ('H2', True)])         |
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        | '20'    |   17330 | None        | 'T'   | [A]       |      3 | ['q10']  | OrderedDict([('NS', 3), ('DP', 11), ('AF', [0.017])])                                   |
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        | '20'    | 1110696 | 'rs6040355' | 'A'   | [G, T]    |     67 | []       | OrderedDict([('NS', 2), ('DP', 10), ('AF', [0.333, 0.667]), ('AA', 'T'), ('DB', True)]) |
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        | '20'    | 1230237 | None        | 'T'   | [None]    |     47 | []       | OrderedDict([('NS', 3), ('DP', 13), ('AA', 'T')])                                       |
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        | '20'    | 1234567 | 'microsat1' | 'G'   | [GA, GAC] |     50 | []       | OrderedDict([('NS', 3), ('DP', 9), ('AA', 'G'), ('AN', 6), ('AC', [3, 1])])             |
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        | '20'    | 1235237 | None        | 'T'   | [None]    | None   | []       | {}                                                                                      |
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        | 'X'     |      10 | 'rsTest'    | 'AC'  | [A, ATG]  |     10 | []       | {}                                                                                      |
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        
        >>> t2 = unpackinfo(t1)
        >>> look(t2)
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
        | 'CHROM' | 'POS'   | 'ID'        | 'REF' | 'ALT'     | 'QUAL' | 'FILTER' | 'NS' | 'AN' | 'AC'   | 'DP' | 'AF'           | 'AA' | 'DB' | 'H2' |
        +=========+=========+=============+=======+===========+========+==========+======+======+========+======+================+======+======+======+
        | '19'    |     111 | None        | 'A'   | [C]       |    9.6 | []       | None | None | None   | None | None           | None | None | None |
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
        | '19'    |     112 | None        | 'A'   | [G]       |     10 | []       | None | None | None   | None | None           | None | None | None |
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
        | '20'    |   14370 | 'rs6054257' | 'G'   | [A]       |     29 | []       |    3 | None | None   |   14 | [0.5]          | None | True | True |
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
        | '20'    |   17330 | None        | 'T'   | [A]       |      3 | ['q10']  |    3 | None | None   |   11 | [0.017]        | None | None | None |
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
        | '20'    | 1110696 | 'rs6040355' | 'A'   | [G, T]    |     67 | []       |    2 | None | None   |   10 | [0.333, 0.667] | 'T'  | True | None |
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
        | '20'    | 1230237 | None        | 'T'   | [None]    |     47 | []       |    3 | None | None   |   13 | None           | 'T'  | None | None |
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
        | '20'    | 1234567 | 'microsat1' | 'G'   | [GA, GAC] |     50 | []       |    3 |    6 | [3, 1] |    9 | None           | 'G'  | None | None |
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
        | '20'    | 1235237 | None        | 'T'   | [None]    | None   | []       | None | None | None   | None | None           | None | None | None |
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
        | 'X'     |      10 | 'rsTest'    | 'AC'  | [A, ATG]  |     10 | []       | None | None | None   | None | None           | None | None | None |
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
    
    .. versionadded:: 0.5
    
    """
    if not keys:
        if hasattr(tbl, 'filename'):
            try:
                import vcf as pyvcf
            except ImportError as e:
                raise UnsatisfiedDependency(e, dep_message)
            reader = pyvcf.Reader(filename=tbl.filename)
            # all INFO
            keys = reader.infos.keys()
    result = unpackdict(tbl, 'INFO', keys=keys)
    if 'prefix' in kwargs:
        result = rename(result, {k: kwargs['prefix'] + k for k in keys})
    if hasattr(tbl, 'filename'):
        return VCFWrapper(result, tbl.filename)
    else:
        return result