Esempio n. 1
0
def blast_to_df_iter(fn, delimiter='\t', chunksize=10000, remap=True):
    '''Stream an NCBI BLAST+ `-outfmt6` results file as DataFrame chunks.

    The file is read without a header row; column names are taken from
    the module-level `blast_cols` pairs, and each chunk is passed through
    `convert_dtypes` with the mapping built from those same pairs.

    BLAST+ reports 1-based coordinates and swaps start/end for hits on
    the negative strand. When `remap` is true, `remap_blast` is applied
    to every chunk, which is intended to turn those coordinates into
    0-based, half-open intervals.

    Args:
        fn (str): The results file.
        delimiter (str): Field separator used in the file.
        chunksize (int): Hits (rows) per yielded DataFrame.
        remap (bool): Whether to run `remap_blast` on each chunk.
    Yields:
        DataFrame: Pandas DataFrame with the BLAST+ hits of one chunk.
    '''

    column_names = [name for name, _ in blast_cols]
    reader = pd.read_table(fn,
                           header=None,
                           skipinitialspace=True,
                           names=column_names,
                           delimiter=delimiter,
                           chunksize=chunksize)

    for chunk in reader:
        # Both helpers are called for their effect on the chunk itself;
        # their return values are not used.
        convert_dtypes(chunk, dict(blast_cols))
        if remap:
            remap_blast(chunk)
        yield chunk
Esempio n. 2
0
def crb_to_df_iter(fn, chunksize=10000, remap=False):
    '''Stream a CRBB (crb-blast 0.6.6) results file as DataFrame chunks.

    The tab-separated file is read without a header row; column names
    come from the module-level `crb_cols` pairs, and each chunk is passed
    through `convert_dtypes` with the mapping built from those pairs.
    The `qrange` and `srange` columns, formatted as `<start>..<end>`,
    are replaced by integer `qstart`/`qend` and `sstart`/`send` columns.

    Args:
        fn (str): The results file.
        chunksize (int): Hits (rows) per yielded DataFrame.
        remap (bool): When true, `remap_blast` (defined elsewhere) is
            applied to each chunk before it is yielded.
    Yields:
        DataFrame: Pandas DataFrame with the CRBB hits of one chunk.
    '''

    # (range column, new start column, new end column), processed in order.
    range_layout = (('qrange', 'qstart', 'qend'),
                    ('srange', 'sstart', 'send'))

    column_names = [name for name, _ in crb_cols]
    reader = pd.read_table(fn,
                           header=None,
                           names=column_names,
                           delimiter='\t',
                           chunksize=chunksize)

    for chunk in reader:
        convert_dtypes(chunk, dict(crb_cols))

        for range_col, start_col, end_col in range_layout:
            # partition yields columns 0 (before '..'), 1 ('..'), 2 (after).
            bounds = chunk[range_col].str.partition('..')
            chunk[start_col] = bounds[0].astype(int)
            chunk[end_col] = bounds[2].astype(int)
            del chunk[range_col]

        if remap:
            remap_blast(chunk)
        yield chunk
Esempio n. 3
0
def crb_to_df_iter(fn, chunksize=10000, remap=False):
    '''Yield the hits of a CRBB (crb-blast 0.6.6) results file in chunks.

    Reads the tab-separated file with no header row, naming the columns
    after the first element of each `crb_cols` pair. Every chunk is run
    through `convert_dtypes`, then its `qrange` and `srange` columns
    (`<start>..<end>` strings) are split into the integer columns
    `qstart`, `qend`, `sstart` and `send`, and the range columns removed.

    Args:
        fn (str): The results file.
        chunksize (int): Hits (rows) per yielded DataFrame.
        remap (bool): If true, each chunk is also passed to `remap_blast`
            (defined elsewhere) before being yielded.
    Yields:
        DataFrame: Pandas DataFrame with the CRBB hits of one chunk.
    '''

    names = [col for col, _ in crb_cols]
    chunks = pd.read_table(fn, header=None, names=names,
                           delimiter='\t', chunksize=chunksize)

    for hits in chunks:
        convert_dtypes(hits, dict(crb_cols))

        # Query range first, then subject range, matching column order
        # of the newly added start/end columns.
        for source, start_name, end_name in (('qrange', 'qstart', 'qend'),
                                             ('srange', 'sstart', 'send')):
            pieces = hits[source].str.partition('..')
            hits[start_name] = pieces[0].astype(int)
            hits[end_name] = pieces[2].astype(int)
            del hits[source]

        if remap:
            remap_blast(hits)
        yield hits
Esempio n. 4
0
def crb_to_df_iter(fn, chunksize=10000, remap=True):
    '''Stream a CRBB (crb-blast 0.6.6) results file as DataFrame chunks.

    crb-blast uses BLAST+ under the hood, so its coordinates get the same
    treatment as BLAST+ output: when `remap` is true, `remap_blast` is
    applied to each chunk, which is intended to produce 0-based,
    half-open intervals.

    Before that, each chunk is passed through `convert_dtypes`, and the
    `qrange`/`srange` columns (`<start>..<end>` strings) are replaced by
    integer `qstart`/`qend` and `sstart`/`send` columns.

    Args:
        fn (str): The results file.
        chunksize (int): Hits (rows) per yielded DataFrame.
        remap (bool): Whether to run `remap_blast` on each chunk.
    Yields:
        DataFrame: Pandas DataFrame with the CRBB hits of one chunk.
    '''

    # (range column, new start column, new end column), processed in order.
    range_layout = (('qrange', 'qstart', 'qend'),
                    ('srange', 'sstart', 'send'))

    column_names = [name for name, _ in crb_cols]
    reader = pd.read_table(fn, header=None, names=column_names,
                           delimiter='\t', chunksize=chunksize)

    for chunk in reader:
        convert_dtypes(chunk, dict(crb_cols))

        for range_col, start_col, end_col in range_layout:
            # partition yields columns 0 (before '..'), 1 ('..'), 2 (after).
            bounds = chunk[range_col].str.partition('..')
            chunk[start_col] = bounds[0].astype(int)
            chunk[end_col] = bounds[2].astype(int)
            del chunk[range_col]

        if remap:
            remap_blast(chunk)
        yield chunk
Esempio n. 5
0
def blast_to_df_iter(fn, delimiter='\t', chunksize=10000, remap=False):
    '''Yield the hits of an NCBI BLAST+ `-outfmt6` file in chunks.

    The file is read without a header row and with whitespace after the
    delimiter skipped. Columns are named after the first element of each
    `blast_cols` pair, and every chunk is passed through `convert_dtypes`
    with the mapping built from those pairs.

    Args:
        fn (str): The results file.
        delimiter (str): Field separator used in the file.
        chunksize (int): Hits (rows) per yielded DataFrame.
        remap (bool): If true, each chunk is also passed to `remap_blast`
            (defined elsewhere) before being yielded.
    Yields:
        DataFrame: Pandas DataFrame with the BLAST+ hits of one chunk.
    '''

    names = [col for col, _ in blast_cols]
    chunks = pd.read_table(fn, header=None, skipinitialspace=True,
                           names=names, delimiter=delimiter,
                           chunksize=chunksize)

    for hits in chunks:
        # Helpers are called for their effect on the chunk; return
        # values are not used.
        convert_dtypes(hits, dict(blast_cols))
        if remap:
            remap_blast(hits)
        yield hits
Esempio n. 6
0
def blast_to_df_iter(fn, delimiter='\t', chunksize=10000, remap=False):
    '''Stream an NCBI BLAST+ `-outfmt6` results file as DataFrame chunks.

    The file is read without a header row; column names are taken from
    the module-level `blast_cols` pairs, and each chunk is passed through
    `convert_dtypes` with the mapping built from those same pairs.

    Args:
        fn (str): The results file.
        delimiter (str): Field separator used in the file.
        chunksize (int): Hits (rows) per yielded DataFrame.
        remap (bool): When true, `remap_blast` (defined elsewhere) is
            applied to each chunk before it is yielded.
    Yields:
        DataFrame: Pandas DataFrame with the BLAST+ hits of one chunk.
    '''

    column_names = [name for name, _ in blast_cols]
    reader = pd.read_table(fn,
                           header=None,
                           skipinitialspace=True,
                           names=column_names,
                           delimiter=delimiter,
                           chunksize=chunksize)

    for chunk in reader:
        convert_dtypes(chunk, dict(blast_cols))
        if remap:
            remap_blast(chunk)
        yield chunk
Esempio n. 7
0
def crb_to_df_iter(fn, chunksize=10000, remap=True):
    '''Yield the hits of a CRBB (crb-blast 0.6.6) results file in chunks.

    Because crb-blast runs BLAST+ under the hood, its output is handled
    like BLAST+ output: with `remap` enabled, every chunk is passed to
    `remap_blast`, which is intended to convert the coordinates to
    0-based, half-open intervals.

    Each chunk is first run through `convert_dtypes`; its `qrange` and
    `srange` columns (`<start>..<end>` strings) are then split into the
    integer columns `qstart`, `qend`, `sstart` and `send`, and the range
    columns are removed.

    Args:
        fn (str): The results file.
        chunksize (int): Hits (rows) per yielded DataFrame.
        remap (bool): Whether to run `remap_blast` on each chunk.
    Yields:
        DataFrame: Pandas DataFrame with the CRBB hits of one chunk.
    '''

    names = [col for col, _ in crb_cols]
    chunks = pd.read_table(fn,
                           header=None,
                           names=names,
                           delimiter='\t',
                           chunksize=chunksize)

    for hits in chunks:
        convert_dtypes(hits, dict(crb_cols))

        # Query range first, then subject range, so the new columns are
        # appended in the order qstart, qend, sstart, send.
        for source, start_name, end_name in (('qrange', 'qstart', 'qend'),
                                             ('srange', 'sstart', 'send')):
            pieces = hits[source].str.partition('..')
            hits[start_name] = pieces[0].astype(int)
            hits[end_name] = pieces[2].astype(int)
            del hits[source]

        if remap:
            remap_blast(hits)
        yield hits
Esempio n. 8
0
def blast_to_df_iter(fn, delimiter='\t', chunksize=10000, remap=True):
    '''Yield the hits of an NCBI BLAST+ `-outfmt6` file in chunks.

    BLAST+ reports 1-based coordinates and swaps start/end for hits on
    the negative strand. With `remap` enabled, every chunk is passed to
    `remap_blast`, which is intended to convert those coordinates to
    0-based, half-open intervals.

    The file is read without a header row; columns are named after the
    first element of each `blast_cols` pair, and every chunk is passed
    through `convert_dtypes` with the mapping built from those pairs.

    Args:
        fn (str): The results file.
        delimiter (str): Field separator used in the file.
        chunksize (int): Hits (rows) per yielded DataFrame.
        remap (bool): Whether to run `remap_blast` on each chunk.
    Yields:
        DataFrame: Pandas DataFrame with the BLAST+ hits of one chunk.
    '''

    names = [col for col, _ in blast_cols]
    chunks = pd.read_table(fn, header=None, skipinitialspace=True,
                           names=names, delimiter=delimiter,
                           chunksize=chunksize)

    for hits in chunks:
        # Helpers are called for their effect on the chunk; return
        # values are not used.
        convert_dtypes(hits, dict(blast_cols))
        if remap:
            remap_blast(hits)
        yield hits