from collections import OrderedDict, defaultdict

import pandas as pd


def call_consensus(alignment, minproportion=0.5):
    """Call a majority-rule consensus from an aligned FASTA string.

    A column is reported only when the fraction of reads with a
    non-gap base at that position is at least `minproportion`.
    """
    pos_base = OrderedDict()
    record_size = 0
    for name, seq in formFASTA(alignment):
        record_size += 1
        # 1-based alignment columns (the original started counting at 2)
        for index, base in enumerate(seq, start=1):
            if index not in pos_base:
                pos_base[index] = defaultdict(int)
            if base != "-":
                pos_base[index][base] += 1

    con_seq = ""
    for pos, count in pos_base.items():
        total = float(sum(count.values()))
        if total / record_size >= minproportion:
            # keep the most frequent base in this column; ties go to
            # the first base seen, as in the original loop
            con_seq += max(count, key=count.get)
    return con_seq, record_size
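

# formFASTA is used throughout but never defined in this excerpt. A
# minimal hypothetical stand-in, assuming it takes the full FASTA text
# and yields (header-without-">", joined-sequence) tuples:
def formFASTA(fasta_text):
    name, seq_parts = None, []
    for line in fasta_text.splitlines():
        line = line.strip()
        if line.startswith(">"):
            if name is not None:
                yield name, "".join(seq_parts)
            name, seq_parts = line[1:], []
        elif line:
            seq_parts.append(line)
    if name is not None:
        yield name, "".join(seq_parts)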

def group_reads(fa_file):
    """Group read sequences by cell barcode (BC) and UMI.

    Expects FASTA headers carrying "BC=<barcode>" and "UMI=<umi>" tags.
    """
    BC_UMI_seq = defaultdict(lambda: defaultdict(list))
    for name, seq in formFASTA(fa_file):
        BC = name.split("BC=")[1].split(" ")[0]
        UMI = name.split("UMI=")[1].split(" ")[0]
        BC_UMI_seq[BC][UMI].append(seq)
    return BC_UMI_seq
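

# A sketch of how group_reads and call_consensus might be combined to
# collapse PCR duplicates into one consensus per BC/UMI pair. This
# driver is illustrative, not from the original source, and assumes the
# reads in each UMI group are already aligned (equal length), since
# call_consensus treats each string index as an alignment column.
def consensus_per_umi(fa_file, minproportion=0.5):
    consensus = {}
    for BC, umi_dict in group_reads(fa_file).items():
        for UMI, seqs in umi_dict.items():
            # rebuild a small FASTA string for this group
            fasta = "".join(">{}_{}_{}\n{}\n".format(BC, UMI, i, s)
                            for i, s in enumerate(seqs))
            con_seq, n_reads = call_consensus(fasta, minproportion)
            consensus[(BC, UMI)] = (con_seq, n_reads)
    return consensus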

def getCCSreadDF(CCS_result_path):
    """Load a CCS FASTA file into a two-column pandas DataFrame."""
    with open(CCS_result_path, 'r') as fh:
        CCS_fa = fh.read()
    readName = []
    readSeq = []
    for name, seq in formFASTA(CCS_fa):
        readName.append(name)
        readSeq.append(seq)
    # build dataframe; restore the ">" stripped from the FASTA headers
    ccs_df = pd.DataFrame({"name": readName, "sequence": readSeq})
    ccs_df["name"] = ">" + ccs_df["name"]
    return ccs_df
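

# Because getCCSreadDF keeps the ">" prefix on names, rows can be joined
# straight back into FASTA text. A hypothetical round-trip helper (name
# and body are illustrative, not from the original source):
def ccs_df_to_fasta(CCS_result_path):
    ccs_df = getCCSreadDF(CCS_result_path)
    return "\n".join(ccs_df["name"] + "\n" + ccs_df["sequence"])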
outSta = open(options.statistics, 'w')
# write statistics header
outSta.write("{}\t{}\t{}\t{}\n".format("zwID", "BC", "UMI", "pass_filter"))
## some input parameters
P2 = options.FP
P4 = options.RP
inFP = options.inFP
inRP = options.inRP
polyAwin = options.pAw
pAmismatch = options.pMs
mismatch = options.mismatch
## get file handle: both branches yield (name, sequence) records
if options.format == "FASTQ":
    file_handle = fqTOfa(inputFile)
elif options.format == "FASTA":
    file_handle = formFASTA(inputFile.read())
## run filter in a worker pool
pfq = multiprocessing.Pool(options.numcpu)
results = []
for record in file_handle:
    header, raw_seq = record
    prf = pfq.apply_async(filterCCS,
                          args=(
                              header,
                              raw_seq,
                              P2,
                              P4,
                              inFP,
                              inRP,
                              polyAwin,
                              pAmismatch,
                              mismatch,
                          ))
    results.append(prf)
## the original snippet breaks off above; closing the pool and draining
## the async results into the statistics file (mirroring the pattern
## used in main() below) is an assumed reconstruction
pfq.close()
pfq.join()
for prf in results:
    outSta.write("{}\n".format("\t".join(map(str, prf.get()))))
outSta.close()
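

# fqTOfa above is another helper not shown in this excerpt; presumably
# it streams a FASTQ handle as (name, sequence) pairs so that both
# format branches yield the same record shape. A minimal sketch under
# that assumption:
def fqTOfa(fq_handle):
    # consume 4-line FASTQ records until the handle is exhausted
    while True:
        header = fq_handle.readline().strip()
        if not header:
            break
        seq = fq_handle.readline().strip()
        fq_handle.readline()  # '+' separator line
        fq_handle.readline()  # quality line
        yield header[1:], seq  # drop the leading '@'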
Example no. 5
import os
import collections
import multiprocessing


def main():
    options = Parsers()

    alignFa = open(options.align, 'r').read()
    targetPos = open(options.targetpos, 'r')
    ### output file
    output = open(os.path.join(options.dir, options.name + ".Events.txt"), 'w')
    # editfre = open(os.path.join(options.dir, options.name + ".EditFrequency.txt"), 'w')
    # eventcount = open(os.path.join(options.dir, options.name + ".EventCounts.txt"), 'w')
    # eventcountDic = open(os.path.join(options.dir, options.name + ".EventCounts.pkl"), "wb")

    next(targetPos)  # skip the header line
    tarPos = []
    header = []
    ### parse target positions: name, start, end (tab-separated)
    for eachline in targetPos:
        sple = eachline.strip().split('\t')
        header.append(sple[0])
        tarPos.append((int(sple[1]), int(sple[2])))
    ### build a dict to count the edit frequency in every base
    # pos = list(range(1, options.len))
    # Edit_fre = {}
    # for p in pos:
    #     Edit_fre[p] = [0,0]
    ### write the output header
    #editfre.write("{}\t{}\t{}\t{}\t{}\n".format("Position", "Insertion", "InserPercent", "Deletion", "DeletPercent"))
    output.write("{}\t{}\t{}\n".format("BC", "UMI", '\t'.join(header)))
    #editfre.flush()
    output.flush()
    ### get the output data
    processPool = multiprocessing.Pool(options.cpu)
    results = []
    all_seq = formFASTA(alignFa)
    # the alignment FASTA alternates reference/read records, so consume
    # them in pairs until the generator is exhausted
    while True:
        try:
            _, refseq = next(all_seq)
            readname, readseq = next(all_seq)
            editresult = processPool.apply_async(analyzeEdit,
                                                 args=(
                                                     readname,
                                                     refseq,
                                                     readseq,
                                                 ))
            results.append(editresult)
        except StopIteration:
            break
    print('waiting for workers to finish')
    processPool.close()
    processPool.join()
    print('done')
    editEventDict = collections.OrderedDict()
    for each_result in results:
        rawEditlist = each_result.get()
        # analyzeEdit returns the read name as the last element
        editEventDict[rawEditlist[-1]] = rawEditlist[:-1]
    #EventCounts = defaultdict(int)
    #total = 0
    for key, value in editEventDict.items():
        #frequency = int(key.strip().split('_')[-1])
        BC = key.split("BC=")[1].split(" ")[0]
        UMI = key.split("UMI=")[1].split(" ")[0]
        stringline = outputEdit(value, tarPos)
        ### write out edit events
        output.write("{}\t{}\t{}\n".format(BC, UMI, stringline))
        #total += 1
        # for align in value:
        #     EventCounts[align.toEditString()] += 1
        #     if align.indicator == "D":
        #         for i in range(align.refpos, align.refpos + len(align.readbase)):
        #             Edit_fre[i][1] += 1
        #     elif align.indicator == "I":
        #         Edit_fre[align.refpos][0] += 1
    # ### write out the edit frequency
    # for k, v in Edit_fre.items():
    #     editfre.write("{}\t{}\t{}\t{}\t{}\n".format(k, v[0], v[0]/total, v[1], v[1]/total))
    ### write out the Edit events counts
    # eventcount.write("{}\t{}\n".format("Editevent", "counts"))
    # clean_EventCounts = {}
    # for eve, num in EventCounts.items():
    #     if "M" not in eve:
    #         clean_EventCounts[eve] = num
    #         eventcount.write("{}\t{}\n".format(eve, num))
    # pickle.dump(clean_EventCounts,eventcountDic)

    #editfre.close()
    output.close()
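

# Entry-point guard, assumed: the excerpt defines main() but does not
# show how the script is invoked.
if __name__ == "__main__":
    main()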