Example #1
    def run(self):
        # Row labels come from the query FASTA headers.
        self.row_labels = Reader(self.query_path).get_headers()
        query_counts = self.make_query_counts()
        target_reader = Reader(self.target_path)
        target_headers_seqs = target_reader.get_data(tuples_only=True)
        # Compare the query counts against every target sequence.
        r_values = []
        for target_header, target in my_tqdm()(target_headers_seqs):
            r_values.append(
                self.compare_query_target(query_counts, target_header, target))
        self.r_values2df(r_values)
        self.percentiles = self.calc_percentiles(query_counts)
        self.save()
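The `my_tqdm()` call acts as a factory that returns a tqdm-like wrapper for the loop. A minimal sketch of such a factory, assuming hypothetical `silent` and `leave` flags (the real helper's signature may differ):

from functools import partial
from tqdm import tqdm

def my_tqdm(silent=False, leave=True):
    """Return a progress-bar callable, or a pass-through when silent."""
    if silent:
        return lambda iterable, **kwargs: iterable
    return partial(tqdm, leave=leave)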
Example #2
    def save(self, names=None):
        """Saves the counts appropriately based on current settings.

        There are four output methods for the counts:
        1. Binary. This saves just the counts as a binary numpy array.
        2. No labels. Saves in plain text, but without any labels.
        3. Default names. If no names are provided, fasta headers will be used as labels.
        4. Custom names. Provide a list of names if you want to label lncRNAs with your own names.

        Parameters
        ----------
        names : [str] (default=None)
            Unique names for rows of the Dataframe.
        """
        err_msg = 'You cannot label a binary file. Set only one of "binary" or "label" as True.'
        assert not (self.binary and self.label), err_msg
        assert self.outfile is not None, 'Please provide an outfile location.'
        if self.binary:
            np.save(self.outfile, self.counts)
        elif self.label:
            if names is None:
                names = Reader(self.infasta).get_headers()
            df = DataFrame(data=self.counts, index=names, columns=self.kmers)
            df.to_csv(self.outfile)
        else:
            np.savetxt(self.outfile, self.counts, delimiter=',', fmt='%1.6f')
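For context, a hedged usage sketch of the three save paths (the BasicCounter constructor appears in the later examples; the file names here are illustrative):

counter = BasicCounter('example.fa', outfile='counts.npy', k=6)
counter.get_counts()
counter.save()  # binary=True by default: writes a .npy array

counter.binary, counter.label = False, True
counter.outfile = 'counts.csv'
counter.save()  # labeled CSV with FASTA headers as row names
counter.save(names=['lnc1', 'lnc2'])  # or supply custom row names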
Example #3
    def __init__(
        self,
        infasta=None,
        outfile=None,
        k=6,
        binary=True,
        mean=True,
        std=True,
        log2=Log2.post,
        leave=True,
        silent=False,
        label=False,
        alphabet="AGTC",
    ):
        self.infasta = infasta
        self.seqs = None
        if infasta is not None:
            self.seqs = Reader(infasta).get_seqs()
        self.outfile = outfile
        self.k = k
        self.binary = binary
        self.mean = mean
        # A string is treated as a path to a precomputed mean array.
        if isinstance(mean, str):
            self.mean = np.load(mean)
        self.std = std
        # Likewise, a string points to a precomputed std. dev. array.
        if isinstance(std, str):
            self.std = np.load(std)
        self.log2 = log2
        self.leave = leave
        self.silent = silent
        self.label = label
        self.counts = None
        self.alpha_len = len(alphabet)
        self.kmers = ["".join(i) for i in product(alphabet, repeat=k)]
        self.map = {kmer: i for i, kmer in enumerate(self.kmers)}

        if self.seqs is not None:
            if len(self.seqs) == 1 and self.std is True:
                err = ("You cannot standardize a single sequence. "
                       "Please pass the path to an std. dev. array, "
                       "or use raw counts by setting std=False.")
                raise ValueError(err)

        if not isinstance(self.log2, Log2):
            raise TypeError(f"log2 must be one of {list(Log2)}")
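`log2` is validated against a `Log2` enum, and the signature defaults to `Log2.post`. A minimal sketch of what such an enum might look like; only `post` appears in the code above, the other members are assumptions:

from enum import Enum

class Log2(Enum):
    pre = 'pre'    # assumed: log2-transform counts before standardization
    post = 'post'  # log2-transform after standardization (the default above)
    none = 'none'  # assumed: no log transform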
Example #4
    def __init__(self,
                 infasta=None,
                 outfile=None,
                 k=6,
                 binary=True,
                 mean=True,
                 std=True,
                 log2=True,
                 leave=True,
                 silent=False,
                 label=False):
        self.infasta = infasta
        self.seqs = None
        if infasta is not None:
            self.seqs = Reader(infasta).get_seqs()
        self.outfile = outfile
        self.k = k
        self.binary = binary
        self.mean = mean
        if isinstance(mean, str):
            self.mean = np.load(mean)
        self.std = std
        if isinstance(std, str):
            self.std = np.load(std)
        self.log2 = log2
        self.leave = leave
        self.silent = silent
        self.label = label

        self.counts = None
        self.kmers = [''.join(i) for i in product('AGTC', repeat=k)]
        self.map = {kmer: i for i, kmer in enumerate(self.kmers)}

        if self.seqs is not None:
            if len(self.seqs) == 1 and self.std is True:
                err = ('You cannot standardize a single sequence. '
                       'Please pass the path to an std. dev. array, '
                       'or use raw counts by setting std=False.')
                raise ValueError(err)
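Because `mean` and `std` accept either booleans or paths to .npy arrays, a sketch of reusing precomputed statistics, which also sidesteps the single-sequence error above (all file names illustrative):

counter = BasicCounter('single_seq.fa',
                       outfile='single_counts.npy',
                       mean='gencode_mean.npy',  # illustrative path
                       std='gencode_std.npy')    # illustrative path
counter.get_counts()
counter.save()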
Example #5
                    default=multiprocessing.cpu_count() - 1)
parser.add_argument('-w', type=int, help='Window for tile size', default=1000)
parser.add_argument('-s',
                    type=int,
                    help='How many bp to slide tiles',
                    default=100)

args = parser.parse_args()

###########################################################################
# Path to known functional domains

target_path = args.t
target_reader = Reader(target_path)
target_head, target_seq = target_reader.get_headers(), target_reader.get_seqs()
target_dict = dict(zip(target_head, target_seq))

# Precomputed Markov models for the BCD and AE domains
BCD = np.load('./mamBCD4.mkv.npy')
AE = np.load('./mamAE4.mkv.npy')
###########################################################################
# Parallelize transcript computations
###########################################################################
with multiprocessing.Pool(args.n) as pool:
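    # (snippet truncated here) A plausible continuation, assuming a hypothetical
    # per-transcript worker score_transcript(header, seq):
    #     results = pool.starmap(score_transcript, zip(target_head, target_seq))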
Example #6
alphabet = list(args.a)
model = args.model
if not model.endswith('/'):
    model += '/'

kDir = model + f'{args.k}/'
modelName = model.split('/')[-2]
# Check if file exists and open if so, else skip this iteration of the loop

with open(kDir + 'hmm.mkv', 'rb') as f:
    hmm = pickle.load(f)


# Determine k from the size of the emission matrix and the alphabet used to generate it
k = int(log(len(hmm['E']['+']), len(args.a)))
kmers = [''.join(p) for p in product(alphabet, repeat=k)]  # generate k-mers
target = Reader(args.db)
targetSeqs, targetHeaders = target.get_seqs(), target.get_headers()
targetMap = defaultdict(list)


# Pool processes onto the number of CPU cores specified by the user
with pool.Pool(args.n) as multiN:
    # product(*[pairs]) yields each (header, seq) pair as a single argument,
    # so this is equivalent to multiN.map(hmmCalc, zip(targetHeaders, targetSeqs))
    jobs = multiN.starmap(hmmCalc, product(*[list(zip(targetHeaders, targetSeqs))]))
    dataDict = dict(jobs)
# Check if no hits were found
# if not all(v == None for v in dataDict.values()):

dataFrames = pd.concat([df for df in dataDict.values() if df is not None])
dataFrames['Length'] = dataFrames['End'] - dataFrames['Start']
dataFrames = dataFrames[['Start','End','Length','kmerLLR','seqName','Sequence']]
if not args.fasta:
Example #7
    flatCoords = list(flatCoords)
    finalCoords = []
    # Group consecutive coordinates into runs: index - value is constant
    # within a run of consecutive integers.
    for k, g in groupby(enumerate(flatCoords), lambda kv: kv[0] - kv[1]):
        finalCoords.append(list(map(itemgetter(1), g)))
    return finalCoords

def nsmall(a, n):
    """Return the (n+1)-th smallest element of a via a partial sort."""
    return np.partition(a, n)[n]
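For example, nsmall picks an order statistic without fully sorting the array (using the numpy import already in scope):

vals = np.array([7.0, 3.0, 9.0, 1.0])
assert nsmall(vals, 0) == 1.0  # minimum
assert nsmall(vals, 1) == 3.0  # second-smallest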


k = 4

BCD = np.load('./mamBCD4.mkv.npy')
AE = np.load('./mamAE4.mkv.npy')

mm10Genes = Reader('gencodeGenesMm10.fa')
mm10Seq = mm10Genes.get_seqs()
mm10Head = mm10Genes.get_headers()
w = 200  # tile window size (bp)
s = 20   # slide step (bp)
seqMap = defaultdict(list)

initModelMap = {'BCD': BCD, 'AE': AE}
for head, seq in tqdm(zip(mm10Head, mm10Seq), total=len(mm10Seq)):
    # Tile each transcript into overlapping windows of width w, sliding by s.
    tiles = [seq[i:i + w] for i in range(0, len(seq) - w + 1, s)]
    mapMm10BCD = np.array([classify(tile, k, initModelMap['BCD']) for tile in tiles])
    mapMm10AE = np.array([classify(tile, k, initModelMap['AE']) for tile in tiles])
    # Negative scores count as hits for either model.
    whereHitBCD = np.array(np.nonzero(mapMm10BCD < 0))
    whereHitAE = np.array(np.nonzero(mapMm10AE < 0))
    whereHit = np.unique(np.concatenate((whereHitBCD, whereHitAE), axis=None))
    if whereHit.size > 0:
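        # (snippet truncated here) Presumably the hit tile indices are recorded
        # per transcript, e.g. seqMap[head].append(whereHit) (hypothetical).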
Example #8
parser.add_argument('-n', type=int,
                    help='Number of processors, default = number of CPUs available',
                    default=multiprocessing.cpu_count() - 1)
parser.add_argument('-w', type=int, help='Window for tile size', default=1000)
parser.add_argument(
    '-s', type=int, help='How many bp to slide tiles', default=100)
args = parser.parse_args()

kmers = [''.join(p) for p in product('AGTC', repeat=args.k)]
kmer_map = dict(zip(kmers, range(4**args.k)))
###########################################################################
# Path to known functional domains
query_path = './queries/queries.fa'
target_path = args.t
target_reader = Reader(target_path)
target_head, target_seq = target_reader.get_headers(), target_reader.get_seqs()
target_dict = dict(zip(target_head, target_seq))

query_reader = Reader(query_path)
queries = dict(zip(query_reader.get_headers(), query_reader.get_seqs()))
###########################################################################
# Precomputed mean and std arrays for standardization
mean_paths = glob.glob('./stats/*mean.npy')
std_paths = glob.glob('./stats/*std.npy')

means = {}
for mean_path in mean_paths:
    means[basename(mean_path)] = np.load(mean_path)
stds = {}
for std_path in std_paths:
    stds[basename(std_path)] = np.load(std_path)
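With the statistics loaded, standardization is typically a z-score against one model's stored mean and std; a minimal sketch, where the dictionary keys are hypothetical file basenames:

counts = np.random.rand(10, 4**args.k)  # stand-in for real k-mer counts
model_mean = means['mamBCD4mean.npy']   # hypothetical key
model_std = stds['mamBCD4std.npy']      # hypothetical key
z_counts = (counts - model_mean) / model_std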
Example #9
    def __init__(self, infasta=None, outfasta=None, outnames=None):
        self.infasta = infasta
        if infasta is not None:
            # Load data, names, and sequences up front.
            self.data, self.names, self.seqs = Reader(infasta).get_data()
        self.outfasta = outfasta
        self.outnames = outnames
Example #10
with open(args.hits, 'rb') as f:
    hits = pickle.load(f)

df = pd.DataFrame.from_dict(hits, orient='index')
for name, row in df.iterrows():
    # Rebuild each row's hit list as {query_index: [coordinates]}.
    query_dict = defaultdict(list)
    for hit in row.iloc[0]:
        query_dict[hit[1]].append(hit[0])
    hits[name] = dict(query_dict)

parsedDf = pd.DataFrame.from_dict(hits,orient='index')

queryFasta = Reader('./queries.fa')
queryHeaders = queryFasta.get_headers()

# Map integer column labels back to the query FASTA headers
queryMap = dict(zip(range(len(queryHeaders)), queryHeaders))
parsedDf.rename(columns=queryMap, inplace=True)
queryCons = {}

for query in parsedDf:
    seekrGenomicCoords = dSeekrCoords(parsedDf, args.w, args.s)
    with multiprocessing.Pool(args.n) as pool:
        ha = pool.starmap(getCons, product(
            *[[list(seekrGenomicCoords.items())], [mouseGTF]]))
        merge = dict(ha)
    queryCons[query] = merge
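Note that product(*[[X], [Y]]) yields the single tuple (X, Y), so the pool makes exactly one getCons call per loop iteration, receiving the whole coordinate list and the GTF. If per-item parallelism was intended, the equivalent would look like this (a sketch, assuming getCons accepts one coordinate item at a time):

ha = pool.starmap(getCons,
                  ((item, mouseGTF) for item in seekrGenomicCoords.items()))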
Example #11
    '-n',
    type=int,
    help='Number of CPU cores. Each job corresponds to a value of k, '
         'and the program scales well with multiprocessing',
    default=1)

args = parser.parse_args()

if __name__ == '__main__':

    # Read in specified values of k, and the alphabet
    kVals = [int(i) for i in args.k.split(',')]
    a = args.a.upper()

    # SEEKR FASTA reader module
    F = Reader(args.fasta)
    fS = F.get_seqs()

    # Join sequences together using the $ delimiter character
    fString = '$'.join(fS).upper()
    lenFString = sum(len(i) for i in fS)

    # Need to figure out how to deal with very long fasta files (~ 2-3X the size of the transcriptome in mice)
    # if lenFString >= 2147483647:
    #     fString='$'.join(fS[::10]).upper()

    # Split jobs onto processors and call the kmers.pyx Cython module
    with pool.Pool(args.n) as multiN:
        # product(*[[fString], kVals, [a]]) yields one (fString, k, a) job per k
        jobs = multiN.starmap(kmers.main, product(*[[fString], kVals, [a]]))
        dataDict = dict(jobs)
Example #12
            for j, cj in enumerate('ATCG'):
                # Conditional probability of letter cj following context ci
                conds[i, j] = kmers[ci + cj] / float(tot)

    return conds

# Null model: k-mer frequencies across the whole transcriptome
genomeRawCounts = BasicCounter(
    '/Users/danielsprague/Downloads/gencode.vM21.transcripts.fa',
    k=k,
    mean=False,
    std=False,
    log2=False,
    alphabet='ATCG')
genomeFa = Reader('/Users/danielsprague/Downloads/gencode.vM21.transcripts.fa')
genomeSeqs = genomeFa.get_seqs()
genomeSeqLens = [len(i) for i in genomeSeqs]
genomeRawCounts.get_counts()
# Counts are stored per kb, so multiply by length/1000 to recover raw counts
unNormGenomeCounts = genomeRawCounts.counts.T * genomeSeqLens / 1000
genomeCounts = np.rint(unNormGenomeCounts.T)
weightedAvgGenomeCounts = np.average(genomeCounts,
                                     weights=genomeSeqLens,
                                     axis=0)

kmers = [''.join(p) for p in itertools.product('ATCG', repeat=k)]
curr_kmers = dict(zip(kmers, weightedAvgGenomeCounts))
genome_avg = markov_chain(curr_kmers, k, 'ATCG')

np.save(f'./genome{k}.mkv.npy', genome_avg)
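A quick sanity check of the per-kb un-normalization above (illustrative numbers):

per_kb = np.array([[3.5, 0.5]])  # one 2,000 bp sequence, two k-mers, counts per kb
seq_lens = [2000]
raw = np.rint(per_kb.T * seq_lens / 1000).T  # -> [[7., 1.]] raw occurrence counts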
Example #13
parser = argparse.ArgumentParser()
parser.add_argument("-k",type=int)
parser.add_argument('--db',type=str,help='Path to fasta file containing training sequences')
parser.add_argument('--prior',type=str,help='Path to binary .mkv file output from train.py (e.g. markovModels/D_null/2/hmm.mkv')
parser.add_argument('-cf','--createfile',action='store_true',help='Create new file rather than overwrite')
parser.add_argument('--its',type=int,help='Iterations to do, default=100',default=20)

args = parser.parse_args()

assert args.its > 0, 'Please provide an integer greater than or equal to 1'
assert args.k > 0, 'Please provide an integer greater than or equal to 1'



fa = Reader(args.db)
seqs = '$'.join(fa.get_seqs())  # concatenate training sequences with '$' separators
model = args.prior

k = args.k

# Identify the location of any ambiguous nucleotides (N)
O, oIdx, nBP = corefunctions.kmersWithAmbigIndex(seqs, k)
# Load in train.py output
with open(args.prior, 'rb') as f:
    hmm = pickle.load(f)

'''
A - transition matrix (dictionary)
E - emission matrix (dictionary)
pi - initial state probabilities (always 50/50)
states - list of states (query,null)