Example #1
def set_output(outfile):
    """
    :param outfile:
    :return:
    """
    if outfile == 'stdout':
        return sys.stdout
    else:
        opn, mode = text_file_mode(outfile, read=False)
        return opn(outfile, mode)
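All examples on this page rely on a helper text_file_mode that is not shown. A minimal sketch of what it presumably does, assuming transparent gzip support selected by file extension (the exact signature is an assumption):

import gzip

def text_file_mode(fpath, read=True):
    # hypothetical reconstruction: pick opener and text mode by extension
    if fpath.endswith('.gz'):
        return gzip.open, 'rt' if read else 'wt'
    return open, 'r' if read else 'w'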
Example #2
def get_superblock_positions(mapfile, tselect=None, qselect=None):
    """
    :param mapfile:
    :param tselect:
    :param qselect:
    :return:
    """
    tskip = fnt.partial(_check_skip, tselect)
    qskip = fnt.partial(_check_skip, qselect)
    opn, mode = text_file_mode(mapfile)
    # the index is built to speed up the mapping from
    # target to query, but, technically, the index is
    # reciprocally valid (directionality just does not matter here)
    readmap = fnt.partial(_read_map_line, 'target')
    last_supblock = None, None
    skip = False
    map_positions = col.defaultdict(dict)
    with opn(mapfile, mode) as maps:
        while True:
            line = maps.readline()
            llen = len(line)
            if not line:
                # EOF reached
                break
            elif not line.strip():
                # skip empty lines
                continue
            else:
                parts = readmap(line)
                tchrom, qchrom = parts.chrom, parts.match
                if (tchrom, qchrom) != last_supblock:
                    skip = tskip(tchrom) or qskip(qchrom)
                    last_supblock = tchrom, qchrom
                    if skip:
                        continue
                    # record start position of new super block
                    pos = maps.tell() - llen
                    try:
                        # Why saved in this order?
                        # For the mapping, a fixed number of query
                        # chromosomes is allocated, say 3, to keep
                        # memory footprint low. Then, the worker
                        # processes load and map in parallel from all
                        # target chromosomes to the few query chromosomes.
                        # It follows that one needs to know:
                        # For a given query chromosome Q, where are the blocks
                        # in the mapping from any target chromosome to this
                        # particular query chromosome?
                        map_positions[qchrom][tchrom].append(pos)
                    except KeyError:
                        map_positions[qchrom][tchrom] = [pos]
    assert map_positions, 'No map positions extracted from file {} - building index failed'.format(mapfile)
    return map_positions
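A hypothetical consumer of this index, seeking directly to the recorded offsets to stream only the super blocks that map onto one query chromosome (the function name is illustrative):

def read_blocks_for_query(mapfile, map_positions, qchrom):
    # illustrative use of the index built above
    opn, mode = text_file_mode(mapfile)
    with opn(mapfile, mode) as maps:
        for tchrom, positions in map_positions[qchrom].items():
            for pos in positions:
                maps.seek(pos)
                yield tchrom, maps.readline()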
Example #3
def process_chains(params):
    """
    :param params:
    :return:
    """
    fpath = params['inputfile']
    chrom = params['chrom']
    re_chrom = re.compile(chrom + '$')
    csize = params['size']
    qchroms = re.compile(params['qcheck'])
    opn, mode = text_file_mode(fpath)
    with opn(fpath, mode=mode, encoding='ascii') as infile:
        chainit = get_chain_iterator(infile, tselect=re_chrom, qselect=qchroms)
        mask, splits, select = build_index_structures(chainit, csize)
    return chrom, mask, splits, select
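A hypothetical parameter set for this function; all values are illustrative, and 'size' is assumed to be the length of the selected target chromosome:

params = {
    'inputfile': 'hg38.mm10.chain.gz',  # illustrative file name
    'chrom': 'chr1',
    'size': 248956422,                  # hg38 chr1 length
    'qcheck': 'chr[0-9XY]+$',           # restrict query chromosomes
}
chrom, mask, splits, select = process_chains(params)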
Example #4
def build_full_chain_index(chainfile, tselect=None, qselect=None):
    """
    :param chainfile:
    :param tselect:
    :param qselect:
    :return:
    """
    if tselect is None:
        tselect = re.compile('.+')
    if qselect is None:
        qselect = re.compile('.+')
    opn, mode = text_file_mode(chainfile)
    newline = True
    pos = 0
    tchrom_sizes = dict()
    qchrom_sizes = dict()
    tchrom_index = col.defaultdict(list)
    qchrom_index = col.defaultdict(list)
    with opn(chainfile, mode) as chains:
        while True:
            line = chains.readline()
            if not line:
                break
            elif line.startswith('chain'):
                assert newline, 'No newline detected before new chain: {} (last pos. {})'.format(line, pos)
                tName, tSize, qName, qSize, chainid = _read_chain_header_index(line)
                if tselect.match(tName) is not None and qselect.match(qName) is not None:
                    tchrom_sizes[tName] = tSize
                    qchrom_sizes[qName] = qSize
                    tchrom_index[tName].append((pos, chainid))
                    qchrom_index[qName].append((pos, chainid))
            elif not line.strip():
                # newline in file
                newline = True
                pos = chains.tell()
            else:
                newline = False
                continue
    target_struct = {'sizes': tchrom_sizes, 'index': tchrom_index}
    query_struct = {'sizes': qchrom_sizes, 'index': qchrom_index}
    return target_struct, query_struct
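One way the returned index could be used: jump to every chain whose target chromosome is chr1 and read its header line (file name is illustrative):

chainfile = 'hg38.mm10.chain.gz'  # illustrative
target_struct, query_struct = build_full_chain_index(chainfile)
opn, mode = text_file_mode(chainfile)
with opn(chainfile, mode) as chains:
    for pos, chainid in target_struct['index']['chr1']:
        chains.seek(pos)
        header = chains.readline()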
Example #5
def get_chain_positions(chainfile, tselect=None, qselect=None, tref=True):
    """
    :param chainfile:
    :param tselect:
    :param qselect:
    :param tref:
    :return:
    """
    if tselect is None:
        tselect = re.compile('.+')
    if qselect is None:
        qselect = re.compile('.+')
    opn, mode = text_file_mode(chainfile)
    newline = True
    pos = 0
    chain_positions = col.defaultdict(dict)
    with opn(chainfile, mode) as chains:
        while True:
            line = chains.readline()
            if not line:
                break
            elif line.startswith('chain'):
                assert newline, 'No newline detected before new chain: {} (last pos. {})'.format(line, pos)
                parts = _read_chain_header(line)
                if tselect.match(parts[0]) is not None and qselect.match(parts[5]) is not None:
                    if tref:
                        tchrom, qchrom = parts[0], parts[5]
                    else:
                        tchrom, qchrom = parts[5], parts[0]
                    try:
                        chain_positions[tchrom][qchrom].append(pos)
                    except KeyError:
                        chain_positions[tchrom][qchrom] = [pos]
            elif not line.strip():
                # newline in file
                newline = True
                pos = chains.tell()
            else:
                newline = False
                continue
    return chain_positions
Example #6
def chromsize_from_chain(chainfile, chrom, target=True):
    """
    :param chainfile:
    :param chrom:
    :return:
    """
    read_head = _read_chain_header
    opn, mode = text_file_mode(chainfile)
    chrom_size = 0
    with opn(chainfile, mode=mode, encoding='ascii') as chf:
        for line in chf:
            if line.strip() and line.startswith('chain'):
                parts = read_head(line)
                if parts[0] == chrom and target:
                    chrom_size = parts[1]
                    break
                elif parts[5] == chrom:
                    # NB: matches the query side even if target is True
                    chrom_size = parts[6]
                    break
    assert chrom_size > 0, 'No entry in chain file {} for chromosome: {}'.format(chainfile, chrom)
    return chrom_size
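For reference, the field indices used in the last three examples are consistent with the UCSC chain header line, assuming _read_chain_header (not shown) drops the leading 'chain' keyword and the score field:

chain score tName tSize tStrand tStart tEnd qName qSize qStrand qStart qEnd id

so that parts[0]/parts[1] hold the target name/size and parts[5]/parts[6] the query name/size.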
Example #7
def read_chromosome_sizes(fpath, keep=r'\w+'):
    """
    :param fpath:
    :param keep:
    :return:
    """
    chroms = dict()
    keeper = re.compile(keep)
    opn, mode = text_file_mode(fpath)
    with opn(fpath, mode=mode, encoding='ascii') as infile:
        for line in infile:
            if not line.strip():
                continue
            cols = line.strip().split()
            cname = cols[0].strip()
            m = keeper.match(cname)
            if m is not None:
                csize = int(cols[1])
                chroms[cname] = csize
    assert chroms, 'No chromosomes from file {} selected with pattern {}'.format(fpath, keep)
    return chroms
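The expected input is presumably a standard two-column chrom.sizes table, e.g. for hg38:

chr1	248956422
chr2	242193529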
Example #8
def build_conservation_mask(chainfile, chrom, csize=None):
    """ Build a mask that indicates
    1: is not conserved (= is masked)
    0: is conserved (= is not masked)
    :param chainfile: path to the chain file
    :param chrom: target chromosome to build the mask for
    :param csize: chromosome size; if None, it is read from the chain file
    :return: boolean mask and the number of alignment blocks processed
    """
    if csize is None:
        csize = chromsize_from_chain(chainfile, chrom)
    mask = np.ones(csize, dtype=bool)
    opn, mode = text_file_mode(chainfile)
    num_aln = 0
    with opn(chainfile, mode=mode, encoding='ascii') as cf:
        chainit = get_chain_iterator(cf, tselect=chrom)
        for aln in chainit:
            mask[aln[1]:aln[2]] = 0
            num_aln += 1
    return mask, num_aln
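A hypothetical usage, computing the fraction of a chromosome covered by aligned (conserved) blocks; the file name is illustrative:

mask, num_aln = build_conservation_mask('hg38.mm10.chain.gz', 'chr1')
conserved = mask.size - int(mask.sum())  # 0-entries mark conserved positions
print('{} alignments cover {:.1%} of chr1'.format(num_aln, conserved / mask.size))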
Example #9
def process_signal(params):
    """
    :param params:
    :return:
    """
    all_data = tuple()
    chrom = params['chrom']
    for fp in params['inputfiles']:
        opn, mode = text_file_mode(fp)
        values = np.zeros(params['size'], dtype=np.float64)
        with opn(fp, mode=mode, encoding='ascii') as infile:
            it = itt.dropwhile(lambda x: not x.strip() or x.split()[0] != chrom, infile)
            for line in it:
                c, s, e, v = line.split()
                if c != chrom:
                    break
                values[int(s):int(e)] = float(v)
        if params['clip'] < 100. and np.count_nonzero(values) > 0:
            new_max = stats.scoreatpercentile(values, params['clip'])
            values = np.clip(values, 0., new_max)
        all_data += (values,)
    if len(all_data) > 1 and not params['noqnorm']:
        retvals = merge_1d_datasets(*all_data,
                                    mergestat=params['mergestat'],
                                    qnorm=True)
    elif len(all_data) > 1 and params['noqnorm']:  # being explicit...
        retvals = merge_1d_datasets(*all_data,
                                    mergestat=params['mergestat'],
                                    qnorm=False)
    else:
        retvals = all_data[0]
    # NB: 'values' still refers to the last input file processed above
    if params['decranks'] and np.count_nonzero(values) > 0:
        retvals = transform_to_dec_ranks(retvals)
    if np.count_nonzero(values) == 0:
        retvals = None
    return chrom, retvals
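The per-file input parsed above is presumably bedGraph-like, i.e. four whitespace-separated columns (chromosome, start, end, value):

chr1	0	1000	0.5
chr1	1000	1500	2.75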
Example #10
def determine_text_table_type(filepath, useheader, logger=None):
    """
    :param filepath:
    :param useheader:
    :param logger:
    :return:
    """
    opn, mode = text_file_mode(filepath)
    fieldnames = []
    skip = 0
    read_chars = 0
    with opn(filepath, mode=mode, encoding='ascii') as text:
        # heuristic to determine the chunksize to be read
        # from the file to surely include a potential header
        # and a full data line
        # Note to self: I always (?) open files in text mode,
        # so len() is fine (number of characters)
        read_chars += len(text.readline())
        read_chars += len(text.readline())
        assert read_chars > 0, 'No lines read from file {} - it appears to be empty'.format(filepath)
        text.seek(0)
        sniffer = csv.Sniffer()
        dialect = sniffer.sniff(text.read(read_chars), delimiters=VALID_DELIMITERS)
        if dialect.delimiter == ' ' and logger is not None:
            logger.warning('Detected {} as delimiter for file {} - this is not ideal'
                           ' and potentially error-prone. Processing will proceed but if you encounter'
                           ' strange values in your (textual) data, it is highly recommended to reformat'
                           ' your files to be {} or {} separated'
                           ' and to restart the whole process.'.format(DELIMITER_NAMES[' '],
                                                                       filepath,
                                                                       DELIMITER_NAMES['\t'],
                                                                       DELIMITER_NAMES[',']))
        else:
            if logger is not None:
                logger.debug('Detected {} as delimiter in file {}'.format(DELIMITER_NAMES.get(dialect.delimiter, dialect.delimiter), os.path.basename(filepath)))
        text.seek(0)
        header = sniffer.has_header(text.read(read_chars))
        if header and not useheader:
            skip = 1
            text.seek(0)
            assumed_header = text.readline()
            if logger is not None:
                logger.debug('Skipping line {} from file {} since'
                             ' "use header" is set to FALSE'.format(assumed_header, os.path.basename(filepath)))
        elif header and useheader:
            # perfect situation
            text.seek(0)
            fieldnames = get_text_table_header(text.readline(), dialect.delimiter)
            if logger is not None:
                logger.debug('Identified header fields: {}'.format(fieldnames))
        elif not header and useheader:
            text.seek(0)
            assumed_header = text.readline()
            if logger is not None:
                logger.warning('csv.Sniffer could not identify a header in file {},'
                               ' but "use header" is TRUE. Trying to extract column'
                               ' names from line {}'.format(os.path.basename(filepath), assumed_header))
            fieldnames = get_text_table_header(assumed_header, dialect.delimiter)
            garbage = check_header_garbage(fieldnames)
            if garbage and logger is not None:
                logger.warning('The following field names in the header seem uncommon'
                               ' or their names have been chosen poorly: {} -'
                               ' Are you sure this file has a header?'.format('[ ' + ' | '.join(garbage) + ' ]'))
        elif not header and not useheader:
            if logger is not None:
                logger.debug('No header detected or forced - ok')
            # fieldnames will be empty, defaults to chrom - start - end
            pass
        else:
            raise AssertionError('How did I end up here?! We need more unit tests...')
    return skip, dialect.delimiter, fieldnames
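VALID_DELIMITERS and DELIMITER_NAMES are module-level constants not shown on this page; a plausible reconstruction, assuming tab, comma, semicolon, and space are the accepted delimiters:

# hypothetical definitions of the constants used above
VALID_DELIMITERS = '\t ,;'
DELIMITER_NAMES = {'\t': 'tab', ',': 'comma', ';': 'semicolon', ' ': 'space'}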
Example #11
def process_regions(params):
    """
    :param params:
    :return:
    """
    fpath = params['inputfile']
    chr_match = re.compile(params['selectchroms'])
    score_col_idx = params['scoreidx']
    if score_col_idx != -1:
        score_col_name = params['colnames'][score_col_idx]
        datatypes = {
            'start': np.int32,
            'end': np.int32,
            score_col_name: np.float64
        }
    else:
        datatypes = {'start': np.int32, 'end': np.int32}

    opn, mode = text_file_mode(fpath)
    with opn(fpath, mode=mode, encoding='ascii') as infile:
        # in the Pandas docs:
        # "Explicitly pass header=0 to be able to replace existing names"
        regions = pd.read_csv(infile,
                              sep=params['delimiter'],
                              names=params['colnames'],
                              index_col=False,
                              dtype=datatypes,
                              header=0 if params['useheader'] else None,
                              skipinitialspace=True,
                              skiprows=params['skip'],
                              skip_blank_lines=True,
                              encoding='utf-8',
                              comment=None,
                              usecols=params['colnames'],
                              low_memory=False)
    chroms_in_file = regions.chrom.drop_duplicates().tolist()
    remove_chroms = set(
        filter(lambda x: chr_match.match(x) is None, chroms_in_file))
    drop_columns = ['filter_for_chrom']
    regions = regions.assign(
        filter_for_chrom=lambda x: x.chrom.isin(remove_chroms))
    regions.drop(regions.index[regions.filter_for_chrom],
                 inplace=True,
                 axis='index')
    if params['filtersize'] > 0:
        drop_columns.append('filter_for_length')
        regions = regions.assign(filter_for_length=lambda x: x.end - x.start)
        regions.drop(
            regions[regions.filter_for_length < params['filtersize']].index,
            inplace=True,
            axis='index')
    if score_col_idx != -1 and params['keeptop'] < 100:
        drop_columns.append('filter_for_score')
        # heuristic to check if score column seems to be reasonable
        assert regions[score_col_name].var() > 0, \
            'Scores have 0 variance in file {} for selected column {}'.format(fpath, score_col_idx)
        lower_threshold = stats.scoreatpercentile(
            regions[score_col_name].values, 100 - params['keeptop'])
        regions = regions.assign(
            filter_for_score=lambda x: x[score_col_name] < lower_threshold)
        regions.drop(regions.index[regions.filter_for_score],
                     inplace=True,
                     axis='index')
    if not params['useheader']:
        for col in regions.columns:
            if col in ['chrom', 'start', 'end', 'name']:
                continue
            drop_columns.append(col)
    regions.drop(drop_columns, axis='columns', inplace=True)
    reordered_columns = reorder_columns(regions.columns.tolist())
    regions = regions[reordered_columns]
    regions.sort_values(['chrom', 'start', 'end'], axis='index', inplace=True)
    regions.index = np.arange(regions.shape[0])
    assert not regions.empty, 'No regions read from file {} (or are left after filtering)'.format(
        fpath)
    return regions, set(regions.chrom.drop_duplicates().tolist())
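A hypothetical parameter set for a headerless, BED-like input file; all values are illustrative:

params = {
    'inputfile': 'peaks.bed.gz',   # illustrative
    'selectchroms': 'chr[0-9]+$',  # keep autosomes only
    'scoreidx': 4,                 # index into 'colnames' below
    'colnames': ['chrom', 'start', 'end', 'name', 'signal'],
    'useheader': False,
    'delimiter': '\t',
    'skip': 0,
    'filtersize': 50,              # drop regions shorter than 50 bp
    'keeptop': 25,                 # keep top 25% by score
}
regions, chroms = process_regions(params)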