Esempio n. 1
0
def output(pas_array,scan_file,out,window,max_shift,species,prob,number_pas,rst):
    """Randomly pick negative sites from a scan file and write their windows.

    Every line of ``scan_file`` is expected to look like
    ``chromosome:pos:strand<TAB>rpm<TAB>base``.  A line becomes a negative
    candidate when it has enough neighbouring lines for a window, is drawn
    with probability ``prob``, and its position is at least ``Threshold``
    (module-level constant) away from the previously accepted candidate and
    from every position in ``pas_array``.

    Candidates are then visited in random order.  For each one whose widened
    window passes ``check``, ``collpase`` is called for the centred window and,
    if that call reports success, for every shift in
    ``[-max_shift, max_shift]`` except 0.  Sampling stops once ``number_pas``
    centred windows have succeeded.

    NOTE(review): ``check`` and ``collpase`` are defined elsewhere; this
    function only relies on ``check`` returning a truthy/falsy value and on
    ``collpase`` returning a number that is 0 on failure.

    Args:
        pas_array: iterable of known (true) site positions to stay away from.
        scan_file: path of the tab-separated scan file to read.
        out: path of the output file (opened for writing, truncated).
        window: window length; half of it is taken on each side of a site.
        max_shift: largest shift (in lines) applied around each site.
        species: passed through to ``collpase``.
        prob: probability of considering an eligible line as a candidate.
        number_pas: number of negative sites to produce.
        rst: passed through to ``collpase``.

    Raises:
        Warning: if fewer than ``number_pas`` negative sites could be produced.
    """
    extend = int(window/2)

    # Read everything up front; the handle is not needed afterwards.
    with open(scan_file,'r') as f:
        lines = f.readlines()
    n_lines = len(lines)

    # The context manager guarantees the output is flushed and closed, also
    # when the "not enough candidates" Warning is raised further down.
    with open(out,'w') as ww:
        pre_pos = 0
        negative_candidate = dict()
        for i,line in enumerate(lines):
            line = line.rstrip('\n')
            pas_id,_rpm,_base = line.split('\t')
            _chromosome,pos,_strand = pas_id.split(':')
            pos = int(pos)

            # Need a full window on both sides plus room for the right shift.
            if not (i-extend>0 and i+extend+1+max_shift<n_lines):
                continue
            # Draw exactly once per eligible line, before any distance test,
            # so the random stream is consumed in the same order as before.
            if not (random.random()<prob):
                continue
            # Too close to the candidate accepted just before this one.
            if abs(pos-pre_pos)<Threshold:
                continue
            # Too close to a known true site.
            if any(abs(pos-true_pos)<Threshold for true_pos in pas_array):
                continue

            pre_pos = pos
            negative_candidate[pas_id] = i

        count = 0
        items = list(negative_candidate.items())
        random.shuffle(items)
        for pas_id,i in items:
            start = i-extend
            end   = i+extend
            # A negative index would silently wrap around to the end of the
            # file; such a candidate has no valid widened window, so skip it.
            if start-max_shift<0:
                continue
            if not check(lines[start-max_shift],lines[end+max_shift],window+2*max_shift):
                continue
            success = collpase(pas_id,'unknown','unknown',lines[start:end+1],ww,species,0,rst)
            count += success
            if success==0:
                continue
            # Also emit the shifted copies of the accepted window.
            for j in range(-max_shift,max_shift+1):
                if j==0:
                    continue
                k = i+j
                start = k-extend
                end   = k+extend
                if start>0 and end+1<n_lines and check(lines[start],lines[end],window):
                    collpase(pas_id,'unknown','unknown',lines[start:end+1],ww,species,j,rst)
            if count>=number_pas:
                break

    if count<number_pas:
        raise Warning("not engough negative candidates, please incerase the probability for selecting!")
    print("successfully randomly get same number of negative pas as ground truth")
def output(pas_dict,scan_file,out,window,max_shift,species):
    """Write the windows around every known site found in a scan file.

    Every line of ``scan_file`` is expected to look like
    ``chromosome:pos:strand<TAB>rpm<TAB>base``.  For each line whose integer
    ``pos`` is a key of ``pas_dict`` and which has a full window of
    neighbouring lines, ``collpase`` is called once per shift in
    ``[-max_shift, max_shift]`` whose shifted window is inside the file and
    passes ``check``.

    NOTE(review): ``check`` and ``collpase`` are defined elsewhere; this
    function only relies on ``check`` returning a truthy/falsy value.

    Args:
        pas_dict: mapping from site position (int) to its type label.
        scan_file: path of the tab-separated scan file to read.
        out: path of the output file (opened for writing, truncated).
        window: window length; half of it is taken on each side of a site.
        max_shift: largest shift (in lines) applied around each site.
        species: passed through to ``collpase``.
    """
    extend = int(window/2)

    # Read everything up front; the handle is not needed afterwards.
    with open(scan_file,'r') as f:
        lines = f.readlines()
    n_lines = len(lines)

    # ``with`` closes the output even if a line is malformed or a helper raises.
    with open(out,'w') as ww:
        for i,line in enumerate(lines):
            line = line.rstrip('\n')
            pas_id,_rpm,_base = line.split('\t')
            _chromosome,pos,_strand = pas_id.split(':')
            pos = int(pos)
            if pos not in pas_dict:
                continue
            pas_type = pas_dict[pos]
            symbol   = 'unknown'
            # The unshifted window must fit inside the file.
            if not (i-extend>0 and i+extend+1<n_lines):
                continue
            for j in range(-max_shift,max_shift+1):
                k = i+j
                start = k-extend
                end   = k+extend
                # Each shifted window is bounds-checked on its own.
                if start>0 and end+1<n_lines and check(lines[start],lines[end],window):
                    collpase(pas_id,pas_type,symbol,lines[start:end+1],ww,species,j)
def dataProcessing(scan_file, window, rst):
    """Build one-hot sequence and coverage arrays for every usable position.

    Every line of ``scan_file`` is expected to look like
    ``pas_id<TAB>value<TAB>base``.  Positions whose base is ``'N'``, whose
    window does not fit inside the file, or whose window fails ``check`` are
    skipped.  For the rest, ``collpase`` is asked for the window's sequence
    and coverage; a returned sequence equal to ``0`` marks a rejected window.

    NOTE(review): ``check`` and ``collpase`` are defined elsewhere; the
    reshape below assumes ``collpase`` yields ``window`` bases and ``window``
    coverage values per accepted position - confirm against that helper.

    Args:
        scan_file: path of the tab-separated scan file to read.
        window: window length; half of it is taken on each side of a position.
        rst: passed through to ``collpase``.

    Returns:
        Tuple ``(data1, data2, PASID)``:
        ``data1`` is a float32 array reshaped to ``(-1, window, 4)`` holding
        the one-hot encoding in ``A, T, C, G`` column order; ``data2`` is a
        float32 array reshaped to ``(-1, window, 1)`` holding the coverage;
        ``PASID`` is an array of the accepted ids.  When no position is
        accepted, empty arrays with the same trailing dimensions are returned.
    """
    extend = int(window / 2)
    data1 = []
    data2 = []
    PASID = []
    alphabet = np.array(['A', 'T', 'C', 'G'])

    # Read everything up front; ``with`` closes the handle even on errors.
    with open(scan_file, 'r') as f:
        lines = f.readlines()
    n_lines = len(lines)

    for i, line in enumerate(lines):
        line = line.rstrip('\n')
        pas_id, _, base = line.split('\t')

        if base == 'N':
            continue
        start = i - extend
        end = i + extend
        # The window must fit strictly inside the file.
        if not (start > 0 and end + 1 < n_lines):
            continue
        if not check(lines[start], lines[end], window):
            continue
        sequence, coverage = collpase(pas_id, lines[start:end + 1], rst)
        if sequence != 0:
            # Comparing a column of bases against the alphabet row broadcasts
            # to one row per base and one column per letter.
            seq = np.array(list(sequence), dtype='|U1').reshape(-1, 1)
            data1.append((seq == alphabet).astype(np.float32))
            data2.append(np.array(coverage).astype(np.float32))
            PASID.append(pas_id)

    # np.stack refuses an empty list, so return well-shaped empty arrays
    # instead of crashing when nothing qualified.
    if not data1:
        return (np.zeros((0, window, 4), dtype=np.float32),
                np.zeros((0, window, 1), dtype=np.float32),
                np.array([], dtype=str))

    data1 = np.stack(data1).reshape([-1, window, 4])
    data2 = np.stack(data2).reshape([-1, window, 1])
    PASID = np.array(PASID, dtype=str)

    return data1, data2, PASID