Esempio n. 1
0
def main(args):
    """Print the total number of intra-chromosomal contacts in a Hi-C file.

    ``utils.read_hic_header(args.path)`` decides the reader: a ``None``
    result selects cooler, anything else is used as the parsed ``.hic``
    header and the data are fetched through ``straw`` at 10000 bp.

    Parameters
    ----------
    args : object
        Only ``args.path`` (the contact file) is read.

    The grand total over all chromosomes is printed to stdout; chromosomes
    that cannot be fetched from a ``.hic`` file are reported on stderr.
    """
    import sys

    import numpy as np
    from scipy import sparse
    from peakachu import utils

    np.seterr(divide='ignore', invalid='ignore')

    # Probing the header is more robust than trusting the file extension.
    hic_info = utils.read_hic_header(args.path)

    totals = 0
    if hic_info is None:
        import cooler
        Lib = cooler.Cooler(args.path)

        for k in Lib.chromnames:
            # Stay sparse: a dense whole-chromosome array can take gigabytes,
            # and the sum of the upper triangle is identical either way.
            intra = sparse.triu(
                Lib.matrix(balance=False, sparse=True).fetch(k))
            totals += int(intra.sum())
    else:
        import straw

        for k in hic_info['chromsizes']:
            try:
                # straw returns three parallel lists: x, y and counts.
                intra = straw.straw('NONE', args.path, k, k, 'BP', 10000)
                totals += sum(intra[2])
            except Exception as exc:
                # Presumably the header can list entries that have no matrix
                # at this resolution; skip them instead of aborting the
                # whole count, but leave a trace of what was skipped.
                sys.stderr.write('skipping {}: {}\n'.format(k, exc))

    print(totals)
Esempio n. 2
0
def main(args):
    """Score every chromosome of a contact file with one pre-trained model.

    The model is loaded from ``args.model``. ``args.path`` is read with
    cooler when ``utils.read_hic_header`` returns ``None`` and through
    ``utils.csr_contact_matrix`` otherwise. For each chromosome a
    ``scoreUtils.Chromosome`` is built, ``score()`` is called and the result
    is passed to ``writeBed`` together with ``args.output``.

    Parameters
    ----------
    args : object
        Reads ``path``, ``model``, ``output``, ``resolution``, ``balance``,
        ``lower``, ``upper`` and ``width``.
    """
    import pathlib

    import numpy as np
    from peakachu import scoreUtils, utils

    # scikit-learn 0.23 removed the vendored ``sklearn.externals.joblib``;
    # prefer the standalone package and fall back for old installations.
    try:
        import joblib
    except ImportError:
        from sklearn.externals import joblib

    np.seterr(divide='ignore', invalid='ignore')

    pathlib.Path(args.output).mkdir(parents=True, exist_ok=True)

    # NOTE: joblib.load unpickles the file; only load models you trust.
    model = joblib.load(args.model)

    # Probing the header is more robust than trusting the file extension.
    hic = utils.read_hic_header(args.path) is not None
    if hic:
        chromosomes = utils.get_hic_chromosomes(args.path, args.resolution)
        norm = 'KR' if args.balance else 'NONE'
    else:
        import cooler
        Lib = cooler.Cooler(args.path)
        chromosomes = Lib.chromnames[:]

    for key in chromosomes:
        # The label handed to Chromosome always carries a "chr" prefix.
        cname = key if key.startswith('chr') else 'chr' + key

        if hic:
            matrix = utils.csr_contact_matrix(norm, args.path, key, key,
                                              'BP', args.resolution)
        else:
            matrix = Lib.matrix(balance=args.balance,
                                sparse=True).fetch(key).tocsr()

        X = scoreUtils.Chromosome(matrix,
                                  model=model,
                                  cname=cname, lower=args.lower,
                                  upper=args.upper, res=args.resolution,
                                  width=args.width)
        result, R = X.score()
        X.writeBed(args.output, result, R)
Esempio n. 3
0
def main(args):
    """Print the total number of intra-chromosomal contacts in a Hi-C file.

    ``utils.read_hic_header(args.path)`` decides the reader: a ``None``
    result selects cooler, anything else is used as the parsed ``.hic``
    header and the data are fetched through ``straw`` at 10000 bp.

    For cooler input only the diagonals at offset
    ``args.min_dis // Lib.binsize`` and above are summed.

    Parameters
    ----------
    args : object
        Reads ``path`` and, for cooler input, ``min_dis``.

    The grand total is printed to stdout; chromosomes that cannot be fetched
    from a ``.hic`` file are reported on stderr and skipped.
    """
    import sys

    import numpy as np
    from scipy import sparse
    from peakachu import utils

    np.seterr(divide='ignore', invalid='ignore')

    # Probing the header is more robust than trusting the file extension.
    hic_info = utils.read_hic_header(args.path)

    totals = 0
    if hic_info is None:
        import cooler
        Lib = cooler.Cooler(args.path)

        mindis = args.min_dis // Lib.binsize

        for k in Lib.chromnames:
            # Stay sparse: a dense whole-chromosome array can take gigabytes,
            # and the sum above the k-th diagonal is identical either way.
            intra = sparse.triu(
                Lib.matrix(balance=False, sparse=True).fetch(k), k=mindis)
            totals += int(intra.sum())
    else:
        import straw

        # NOTE(review): args.min_dis is not applied in this branch, so .hic
        # totals include short-range contacts -- confirm this is intended.
        for k in hic_info['chromsizes']:
            try:
                # straw returns three parallel lists: x, y and counts.
                intra = straw.straw('NONE', args.path, k, k, 'BP', 10000)
                totals += sum(intra[2])
            except Exception as exc:
                # The .hic header and the stored matrices can disagree; keep
                # counting, but do not swallow Ctrl-C or hide what was
                # skipped.
                sys.stderr.write('skipping {}: {}\n'.format(k, exc))

    print(totals)
Esempio n. 4
0
def _strip_chr_prefix(label):
    """Return *label* without one leading literal ``'chr'``, if present."""
    return label[3:] if label.startswith('chr') else label


def main(args):
    """Score the single chromosome named by the model file.

    The chromosome is derived from the base name of ``args.model`` (the text
    before ``.pk``), re-labelled with the prefix reported by
    ``utils.find_chrom_pre`` for the chromosomes of ``args.path``. Its
    contact matrix is wrapped in ``scoreUtils.Chromosome``, ``score()`` is
    called and the result is passed to ``writeBed`` with ``args.output``.

    Parameters
    ----------
    args : object
        Reads ``path``, ``model``, ``output``, ``resolution``, ``balance``,
        ``lower``, ``upper`` and ``width``.
    """
    import os
    import pathlib

    import numpy as np
    from peakachu import scoreUtils, utils

    # scikit-learn 0.23 removed the vendored ``sklearn.externals.joblib``;
    # prefer the standalone package and fall back for old installations.
    try:
        import joblib
    except ImportError:
        from sklearn.externals import joblib

    np.seterr(divide='ignore', invalid='ignore')

    pathlib.Path(args.output).mkdir(parents=True, exist_ok=True)

    # NOTE: joblib.load unpickles the file; only load models you trust.
    model = joblib.load(args.model)

    # Probing the header is more robust than trusting the file extension.
    hic_info = utils.read_hic_header(args.path)
    hic = hic_info is not None

    if hic:
        chromosomes = list(hic_info['chromsizes'])
    else:
        import cooler
        Lib = cooler.Cooler(args.path)
        chromosomes = Lib.chromnames[:]

    pre = utils.find_chrom_pre(chromosomes)
    model_file = os.path.split(args.model)[1]  # support full path
    # str.lstrip('chr') would strip any run of the characters c/h/r (turning
    # e.g. "random" into "andom"), so remove the literal prefix instead.
    # ccname is consistent with chromosome labels in .hic / .cool
    ccname = pre + _strip_chr_prefix(model_file.split('.pk')[0])
    # cikada always has prefix "chr"
    cikada = 'chr' + _strip_chr_prefix(ccname)

    if hic:
        norm = 'KR' if args.balance else 'NONE'
        matrix = utils.csr_contact_matrix(norm, args.path, ccname, ccname,
                                          'BP', args.resolution)
    else:
        matrix = Lib.matrix(balance=args.balance,
                            sparse=True).fetch(ccname).tocsr()

    X = scoreUtils.Chromosome(matrix,
                              model=model,
                              cname=cikada,
                              lower=args.lower,
                              upper=args.upper,
                              res=args.resolution,
                              width=args.width)
    result, R = X.score()
    X.writeBed(args.output, result, R)
Esempio n. 5
0
def main(args):
    """Train one model per chromosome, holding that chromosome out.

    Positive coordinates come from ``args.bedpe`` via ``trainUtils.parsebed``;
    negatives are drawn with ``trainUtils.negative_generating``. Features are
    collected per chromosome, then for each chromosome a model is trained
    with ``trainUtils.trainRF`` on the features of all *other* chromosomes
    and dumped to ``<args.output>/<chromname>.pkl``.

    Parameters
    ----------
    args : object
        Reads ``path``, ``bedpe``, ``output``, ``resolution``, ``balance``
        and ``width``.
    """
    import pathlib
    import sys

    import numpy as np
    from peakachu import trainUtils, utils

    # scikit-learn 0.23 removed the vendored ``sklearn.externals.joblib``;
    # prefer the standalone package and fall back for old installations.
    try:
        import joblib
    except ImportError:
        from sklearn.externals import joblib

    np.seterr(divide='ignore', invalid='ignore')

    out_dir = pathlib.Path(args.output)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Probing the header is more robust than trusting the file extension.
    hic = utils.read_hic_header(args.path) is not None

    coords = trainUtils.parsebed(args.bedpe, lower=2, res=args.resolution)
    kde, lower, long_start, long_end = trainUtils.learn_distri_kde(coords)

    if hic:
        chromosomes = utils.get_hic_chromosomes(args.path, args.resolution)
        norm = 'KR' if args.balance else 'NONE'
    else:
        import cooler
        Lib = cooler.Cooler(args.path)
        chromosomes = Lib.chromnames[:]

    def with_chr_prefix(key):
        # Feature dictionaries and output files are keyed by "chr"-prefixed
        # labels regardless of how the contact file names its chromosomes.
        return key if key.startswith('chr') else 'chr' + key

    # Pass 1: collect positive and negative feature matrices per chromosome.
    positive_class = {}
    negative_class = {}
    for key in chromosomes:
        chromname = with_chr_prefix(key)
        print('collecting from {}'.format(key))
        if hic:
            X = utils.csr_contact_matrix(norm, args.path, key, key, 'BP',
                                         args.resolution)
        else:
            X = Lib.matrix(balance=args.balance,
                           sparse=True).fetch(key).tocsr()
        clist = coords[chromname]

        try:
            # np.vstack needs a real sequence: generators were deprecated in
            # NumPy 1.16 and are rejected with TypeError by current releases.
            positive_class[chromname] = np.vstack(
                list(trainUtils.buildmatrix(X, clist, width=args.width)))
            neg_coords = trainUtils.negative_generating(
                X, kde, clist, lower, long_start, long_end)
            stop = len(clist)
            negative_class[chromname] = np.vstack(
                list(trainUtils.buildmatrix(
                    X, neg_coords, width=args.width, positive=False,
                    stop=stop)))
        except Exception as exc:
            # Best effort: a chromosome without usable features is left out,
            # but the reason is kept visible and Ctrl-C still interrupts.
            print(chromname, ' failed to gather fts')
            sys.stderr.write('{}: {}\n'.format(chromname, exc))

    # Pass 2: train each model on every chromosome except its own.
    for key in chromosomes:
        chromname = with_chr_prefix(key)

        Xtrain = np.vstack(
            [v for k, v in positive_class.items() if k != chromname])
        Xfake = np.vstack(
            [v for k, v in negative_class.items() if k != chromname])
        print(chromname, 'pos/neg: ', Xtrain.shape[0], Xfake.shape[0])
        model = trainUtils.trainRF(Xtrain, Xfake)

        joblib.dump(model,
                    str(out_dir / (chromname + '.pkl')),
                    compress=('xz', 3))