def ExtractFromTopconsSingleAllinfo(recordContent, seqid2AnnoDict):  #{{{
    """Extract the non-NAME lines of one TOPCONS-single "allinfo" record.

    The text from column 9 onwards of every line starting with ``NAME`` is
    concatenated into an annotation, and the sequence ID is derived from it
    with ``myfunc.GetSeqIDFromAnnotation``.  When that ID starts with
    ``UniRef``, its second ``_``-separated field is used instead.

    Args:
        recordContent: text of a single record.
        seqid2AnnoDict: mapping from sequence ID to annotation.

    Returns:
        A dict with the keys ``'anno'`` (the value stored in
        ``seqid2AnnoDict`` for the ID) and ``'otherLineList'`` (all
        non-empty lines not starting with ``NAME``, in input order).  An
        empty dict is returned when the record has no non-empty line or
        when the ID is not a key of ``seqid2AnnoDict``.
    """
    # A list comprehension rather than filter(): filter() returns an
    # iterator on Python 3, on which len() fails.
    lines = [line for line in recordContent.split("\n") if line]
    if not lines:
        return {}

    annoParts = []
    otherLineList = []
    for line in lines:
        if line[0:4] == 'NAME':
            annoParts.append(line[9:])
        else:
            otherLineList.append(line)

    seqid = myfunc.GetSeqIDFromAnnotation("".join(annoParts))
    if seqid.startswith("UniRef"):
        seqid = seqid.split("_")[1]
    if seqid not in seqid2AnnoDict:
        return {}
    return {
        'anno': seqid2AnnoDict[seqid],
        'otherLineList': otherLineList,
    }
def GetDatabaseIDList(annoList):#{{{
    """Return the unique sequence IDs found in a list of annotation lines.

    Empty entries and entries starting with '#' are skipped.  For every
    other entry the first word (``myfunc.GetFirstWord``) is cut at its first
    '(' or '/', whichever comes first; words containing "target" are
    ignored, all others are converted to an ID with
    ``myfunc.GetSeqIDFromAnnotation``.  Duplicates are removed with
    ``myfunc.uniquelist``.
    """
    collected = []
    for anno in annoList:
        if anno == "" or anno[0] == "#":
            continue
        word = myfunc.GetFirstWord(anno)
        # Candidate cut positions: every delimiter that is present, plus
        # the end of the word as fallback.
        cutPositions = [len(word)]
        for delimiter in ('(', '/'):
            pos = word.find(delimiter)
            if pos != -1:
                cutPositions.append(pos)
        word = word[:min(cutPositions)]
        if word.find("target") == -1:
            collected.append(myfunc.GetSeqIDFromAnnotation(word))

    return myfunc.uniquelist(collected)
def Phobius2Fasta(infile, outpath):  #{{{
    """Convert a Phobius result file into a FASTA-like topology file.

    The input is read in chunks of BLOCK_SIZE characters and parsed with
    ``Read_phobius_result_from_buffer``; text that does not yet form a
    complete record is carried over to the next chunk.  Every record with a
    non-empty ``'predtopo'`` is written as ``>anno`` followed by the
    topology on the next line.

    Args:
        infile: path of the Phobius result file.
        outpath: directory receiving ``<rootname>_PHOBIUS.topo``, where
            rootname is the base name of infile without its extension.

    Raises:
        IOError: re-raised after a message has been written to stderr.
    """
    rootname = os.path.basename(os.path.splitext(infile)[0])
    outfile = outpath + os.sep + rootname + "_PHOBIUS.topo"
    try:
        # "with" closes both handles even when parsing or writing raises.
        # The output file is opened first, as in the previous version.
        with open(outfile, "w") as fpout, open(infile, "r") as fpin:
            unprocessedBuffer = ""
            isEOFreached = False
            while not isEOFreached:
                buff = fpin.read(BLOCK_SIZE)
                # A short read means the end of the file was reached.
                if len(buff) < BLOCK_SIZE:
                    isEOFreached = True
                recordList = []
                unprocessedBuffer = Read_phobius_result_from_buffer(
                    unprocessedBuffer + buff, recordList, isEOFreached)
                for record in recordList:
                    if record['predtopo'] != "":
                        fpout.write(">%s\n" % (record['anno']))
                        fpout.write("%s\n" % record['predtopo'])
        print("Results have been output to")
        print("\t%s" % outfile)
    except IOError:
        sys.stderr.write("Failed to open file %s for read\n" % (infile))
        raise
Beispiel #4
0
def ReadTopoWithDGScore(inFile):  #{{{
    """Read topologies together with their DG scores.

    A record starts at a line beginning with '>'.  The line after it is
    taken as the topology and the line after that as a whitespace-separated
    list of DG scores (any '/' is removed before splitting).

    Returns:
        (topoWithDGScoreList, indexID).  The first item is a list of dicts
        with the keys 'annoline', 'id', 'topo' and 'dgscores' (a list of
        floats); the second maps each sequence ID to the position of its
        record in that list.
    """
    fpin = open(inFile, "r")
    lines = fpin.readlines()
    fpin.close()

    recordList = []
    id2Index = {}
    numLine = len(lines)
    pos = 0
    while pos < numLine:
        if lines[pos][0] != '>':
            pos += 1
            continue
        annoLine = lines[pos].lstrip(">").strip()
        topo = lines[pos + 1].strip()
        scoreFields = lines[pos + 2].replace("/", "").split()
        dgscores = [float(field) for field in scoreFields]
        seqID = myfunc.GetSeqIDFromAnnotation(annoLine)

        id2Index[seqID] = len(recordList)
        recordList.append({
            'annoline': annoLine,
            'id': seqID,
            'topo': topo,
            'dgscores': dgscores,
        })
        # Step over the annotation and topology lines; the score line is
        # examined by the next iteration like any other line.
        pos += 2
    return (recordList, id2Index)
Beispiel #5
0
def GetFastaID2(infile, fpout):  #{{{
    """Write the sequence ID of every '>' annotation line of infile to fpout.

    This is the faster version: the file is read in chunks of BLOCK_SIZE
    characters instead of line by line.  An annotation line that is cut by
    a chunk boundary is kept in brokenAnnoLine and completed from the
    following chunk(s).

    For each annotation line the ID is obtained with
    myfunc.GetSeqIDFromAnnotation.  When g_params['isPrintAnnoLine'] is
    false only the ID is written, otherwise "ID<TAB>annotation line".

    NOTE(review): an annotation line still pending in brokenAnnoLine when
    the file ends (no trailing newline) is never written.
    """
    # The faster version
    isPrintAnnoLine = g_params['isPrintAnnoLine']
    fpin = open(infile, "r")
    buff = fpin.read(BLOCK_SIZE)
    brokenAnnoLine = ""
    ##for the annotation line broken by BLOCK read
    while buff:
        # beg/end delimit the annotation line currently being handled
        # within this chunk.
        beg = 0
        end = 0
        while 1:
            if brokenAnnoLine:
                # Finish the annotation line started in a previous chunk.
                end = buff.find("\n")
                if end >= 0:
                    line = brokenAnnoLine + buff[0:end]
                    line = line.lstrip(">").rstrip("\n")
                    idd = myfunc.GetSeqIDFromAnnotation(line)
                    if not isPrintAnnoLine:
                        print >> fpout, idd
                    else:
                        fpout.write("%s\t%s\n" % (idd, line))
                    brokenAnnoLine = ""
                    beg = end
                else:
                    # Still no newline: the whole chunk belongs to the
                    # pending annotation line.
                    brokenAnnoLine += buff
                    break

            # Locate the next annotation line at or after beg.
            beg = buff.find(">", beg)
            end = buff.find("\n", beg + 1)
            if beg >= 0:
                if end >= 0:
                    line = buff[beg:end]
                    line = line.lstrip(">").rstrip("\n")
                    idd = myfunc.GetSeqIDFromAnnotation(line)
                    if not isPrintAnnoLine:
                        print >> fpout, idd
                    else:
                        fpout.write("%s\t%s\n" % (idd, line))
                    # Continue searching after this annotation line.
                    beg = end
                else:
                    # Annotation line runs past the end of the chunk;
                    # keep the fragment for the next chunk.
                    brokenAnnoLine = buff[beg:]
                    break
            else:
                # No further '>' in this chunk.
                break

        buff = fpin.read(BLOCK_SIZE)
    fpin.close()
Beispiel #6
0
def BlastM9toPairlist(infile, fpout):  #{{{
    """Write the "queryID hitID" pair of the first BLAST record to fpout.

    The records are read with ``ReadBlastOutput`` (``fmt=9``, iteration
    taken from ``g_params['iteration']``).  The hit ID is reduced to a
    sequence ID with ``myfunc.GetSeqIDFromAnnotation``.
    """
    recordList = ReadBlastOutput(infile,
                                 iteration=g_params['iteration'],
                                 fmt=9)
    # NOTE(review): at most one record is written (the original loop ran
    # over range(min(1, len(recordList)))) -- confirm that this is
    # intended and that max() was not meant.
    for rd in recordList[:1]:
        fpout.write("%s %s\n" % (rd['queryID'],
                                 myfunc.GetSeqIDFromAnnotation(rd['hitID'])))
def WriteOnelineSeq(seqWithAnno, idSet, fpout):  #{{{
    """Write one sequence record to fpout as a single "seqid sequence" line.

    Args:
        seqWithAnno: one record; the first line is the annotation, the
            remaining lines hold the sequence.
        idSet: set of sequence IDs already written.  Records whose ID is in
            the set are ignored; the ID of a written record is added.
        fpout: writable file-like object.

    Newlines and spaces are removed from the sequence.  A record without
    any newline is treated as an annotation with an empty sequence
    (previously the last annotation character was cut off and written as
    the sequence).
    """
    annoLine, _, rawSeq = seqWithAnno.partition("\n")
    seqid = myfunc.GetSeqIDFromAnnotation(annoLine)
    if seqid in idSet:
        return
    aaSeq = rawSeq.replace('\n', '').replace(' ', '')
    fpout.write("%s %s\n" % (seqid, aaSeq))
    idSet.add(seqid)
Beispiel #8
0
def ReplaceFastaAnnotation(fastaFile, annotationFile, fpout):
    """Write the sequences of fastaFile with annotation lines from annotationFile.

    Both files are walked in parallel.  For every line of annotationFile
    starting with '>' the next record of fastaFile is taken; the IDs of the
    two annotation lines (myfunc.GetSeqIDFromAnnotation) must be equal,
    otherwise a message is written to stderr and the program exits with
    status 1.  The annotation line from annotationFile is written to fpout,
    followed by the (stripped) sequence lines of the fasta record.

    Returns 0.
    """
    # this version read in the annotation file first
    cntAnno = 0
    cntLineFasta = 0
    lineFa = ""
    lineAnno = ""

    fpFa = open(fastaFile, "r")
    fpAnno = open(annotationFile, "r")
    lineAnno = fpAnno.readline()
    cntAnno += 1

    while lineAnno:
        lineAnno = lineAnno.rstrip('\n').strip()
        if lineAnno and lineAnno[0] == ">":
            # Parsed as: not ((lineFa and lineFa[0]) == ">").  A new fasta
            # line is read unless lineFa already holds the annotation line
            # left over from copying the previous record.
            if not (lineFa and lineFa[0]) == ">":
                lineFa = fpFa.readline()
                cntLineFasta += 1
                lineFa = lineFa.rstrip('\n').strip()
            if lineFa and lineFa[0] == ">":
                #starting a new sequnce
                idFa = myfunc.GetSeqIDFromAnnotation(lineFa)
                idAnno = myfunc.GetSeqIDFromAnnotation(lineAnno)
                if idFa != idAnno:
                    print >> sys.stderr, "annotation line does not match, annoRecord=%d, fastaLine=%d" % (
                        cntAnno + 1, cntLineFasta + 1)
                    print >> sys.stderr, "idFa=%s, idAnno=%s" % (idFa, idAnno)
                    sys.exit(1)
                print >> fpout, "%s" % lineAnno
                lineFa = fpFa.readline()
                cntLineFasta += 1
                lineFa = lineFa.rstrip("\n").strip()
                # Copy sequence lines until the next annotation line, an
                # empty (after stripping) line or the end of the file.
                while lineFa and lineFa[0] != ">":
                    print >> fpout, "%s" % lineFa
                    lineFa = fpFa.readline()
                    cntLineFasta += 1
                    lineFa = lineFa.rstrip("\n").strip()
        lineAnno = fpAnno.readline()
        cntAnno += 1
    fpFa.close()
    fpAnno.close()
    return 0
Beispiel #9
0
def GetFromRawSeq(seqWithAnno, isPrintID, isJustPrintSum, fpout):#{{{
    """Return the sequence length of one record, optionally writing it.

    The sequence is everything from the first newline of seqWithAnno
    onwards, with newlines and spaces removed.

    Unless isJustPrintSum is true, the length is written to fpout on its
    own line, preceded by the sequence ID and a tab when isPrintID is true.
    """
    firstNewline = seqWithAnno.find("\n")
    residues = seqWithAnno[firstNewline:].replace('\n', '').replace(' ', '')
    length = len(residues)
    if isJustPrintSum:
        return length
    if isPrintID:
        fpout.write("%s\t" % myfunc.GetSeqIDFromAnnotation(seqWithAnno))
    fpout.write("%d\n" % length)
    return length
Beispiel #10
0
def GetFastaID(infile, fpout):  #{{{
    """Write the sequence ID of every annotation line of infile to fpout.

    Lines are stripped of surrounding whitespace; those starting with '>'
    are converted to an ID with ``myfunc.GetSeqIDFromAnnotation`` and the
    ID is written on its own line.

    Returns:
        0
    """
    # "with" closes the input file even when ID extraction or writing
    # raises.
    with open(infile, "r") as fpin:
        for line in fpin:
            line = line.strip()
            if line.startswith(">"):
                fpout.write("%s\n" % myfunc.GetSeqIDFromAnnotation(line))
    return 0
Beispiel #11
0
def GetHMMScore(inFile):#{{{
    """Parse per-sequence HMM score records from a text file.

    A record starts at a line containing "SeqID:" and extends to the next
    blank line or the end of the file.  Inside a record, "key:value" lines
    with the keys SeqLength, NormalizedLogLikelihood, Logodds, Reversi and
    IsTMProtein are recognised; a line starting with ">" supplies the
    sequence ID (via ``myfunc.GetSeqIDFromAnnotation``) when the "SeqID:"
    line carried none.  Other lines are ignored.

    Args:
        inFile: path of the score file.

    Returns:
        (idList, lengthList, normLogList, logoddsList, reversiList,
        isTMProList) -- six parallel lists with one entry per record.
        Lengths are ints (0 when missing); all other values are strings,
        "." when missing.
    """
    idList = []
    lengthList = []
    normLogList = []
    logoddsList = []
    reversiList = []
    isTMProList = []
    # "with" closes the file even when a malformed value makes int() raise.
    with open(inFile, "r") as fpin:
        line = fpin.readline()
        while line:
            if line.find("SeqID:") >= 0:
                seqID = "."
                normLog = "."
                logodds = "."
                reversi = "."
                isTMPro = "."
                length = 0
                strs = line.split(":")
                if strs[1].strip():
                    seqID = strs[1].strip()
                # Consume the lines of this record.
                while True:
                    line = fpin.readline()
                    if not line.strip():
                        break
                    strs = line.split(":")
                    key = strs[0]
                    if key == "SeqLength":
                        length = int(strs[1])
                    elif key == "NormalizedLogLikelihood":
                        normLog = strs[1].strip()
                    elif key == "Logodds":
                        logodds = strs[1].strip()
                    elif key == "Reversi":
                        reversi = strs[1].strip()
                    elif key == "IsTMProtein":
                        isTMPro = strs[1].strip()
                    elif key.startswith(">"):
                        # startswith() rather than key[0]: a line that
                        # begins with ":" yields an empty key, on which
                        # indexing raised IndexError.
                        if seqID == '.':
                            seqID = myfunc.GetSeqIDFromAnnotation(key)
                idList.append(seqID)
                lengthList.append(length)
                normLogList.append(normLog)
                logoddsList.append(logodds)
                reversiList.append(reversi)
                isTMProList.append(isTMPro)
            line = fpin.readline()

    return (idList, lengthList, normLogList, logoddsList, reversiList,
            isTMProList)
Beispiel #12
0
def OutputSplittedSeq(
        seqWithAnno,
        rootname,
        cntsplit,  #{{{
        cntseq_of_split,
        fpout):
    """Append one sequence record to the current split file.

    A new output file is opened when fpout is None.  Its name is
    ``<rootname>_<cntsplit>`` when ``g_params['isNameFileSequentially']``
    is true, otherwise the sequence ID (falling back to
    ``<rootname>_<cntsplit>`` for an empty ID); directory and extension
    come from ``g_params['outpath']`` and ``g_params['file_ext']``.  Once
    ``g_params['numseq_per_split']`` records have been written the file is
    closed and the counters advance to the next split.

    Returns:
        (cntsplit, cntseq_of_split, fpout) with the updated state, or 1
        when the output file cannot be opened.
    """
    annoEnd = seqWithAnno.find("\n")
    annoLine = seqWithAnno[0:annoEnd]
    seqBody = seqWithAnno[annoEnd:]
    seqID = myfunc.GetSeqIDFromAnnotation(annoLine)
    maxSeqPerSplit = g_params['numseq_per_split']

    # The caller is expected to hand in a counter below the limit.
    if cntseq_of_split >= maxSeqPerSplit:
        sys.stderr.write(
            "Error! cntseq_of_split (%d) >= numseq_per_split (%d)\n" %
            (cntseq_of_split, maxSeqPerSplit))
        return (cntsplit, cntseq_of_split, fpout)

    outfile = ""
    if fpout is None:
        if g_params['isNameFileSequentially']:
            stem = rootname + "_%d" % cntsplit
        else:
            if seqID == "":
                seqID = rootname + "_%d" % cntsplit
            stem = seqID
        outfile = (g_params['outpath'] + os.sep + stem + "." +
                   g_params['file_ext'])
        try:
            fpout = open(outfile, "w")
        except IOError:
            sys.stderr.write("Failed to write to file %s\n" % outfile)
            return 1

    fpout.write("%s" % annoLine)
    fpout.write("%s\n" % seqBody)

    cntseq_of_split += 1
    if cntseq_of_split >= maxSeqPerSplit:
        # This split is full: close it and move on to the next one.
        fpout.close()
        fpout = None
        cntseq_of_split = 0
        cntsplit += 1
    # Only report when a file was opened by this call.
    if g_params['verbose'] >= 2 and outfile != "":
        sys.stdout.write("split %d\t%s output\n" % (cntsplit, outfile))

    return (cntsplit, cntseq_of_split, fpout)
def ReadSignalPFile(infile):  #{{{
    """Read a SignalP result file into a dict keyed by sequence ID.

    The file is read through ``myfunc.ReadLineByBlock``.  Empty lines and
    lines starting with '#' are skipped; for every other line the ID is
    obtained with ``myfunc.GetSeqIDFromAnnotation`` and mapped to the whole
    line (a later line with the same ID replaces an earlier one).

    Returns:
        The dict, or 1 when the reader reports a failure.
    """
    hdl = myfunc.ReadLineByBlock(infile)
    if hdl.failure:
        return 1
    id2Line = {}
    while True:
        lines = hdl.readlines()
        if lines is None:
            break
        for line in lines:
            if line != "" and line[0] != "#":
                id2Line[myfunc.GetSeqIDFromAnnotation(line)] = line
    hdl.close()
    return id2Line
def SelectLineByID(infile, idListSet, fpout):#{{{
    """Copy to fpout the lines of infile whose ID is in idListSet.

    Empty lines and lines starting with '#' are always copied.  How the ID
    of a line is obtained depends on ``g_params['method_getid']``:

        0 -- first whitespace-separated word
        1 -- first word, cut at the first ';'
        2 -- ``myfunc.GetSeqIDFromAnnotation(line)``
        3 -- the field(s) listed (1-based) in
             ``g_params['sel_field_list']``; several fields form a tuple

    Lines from which no ID can be extracted are reported on stderr and
    skipped.  Previously the ID of the preceding line was reused for such
    lines (or a NameError was raised on the first line).

    Returns:
        0 on success, 1 when the reader reports a failure.
    """
    hdl = myfunc.ReadLineByBlock(infile)
    if hdl.failure:
        return 1
    method_getid = g_params['method_getid']
    sel_field_list = g_params['sel_field_list']
    # Field used by method 3 when at most one field is selected.  With an
    # empty list this is 0, i.e. index -1 (the last field), as before.
    sel_field = 0
    if len(sel_field_list) == 1:
        sel_field = sel_field_list[0]

    lines = hdl.readlines()
    while lines is not None:
        for line in lines:
            if not line or line[0] == "#":
                fpout.write("%s\n" % line)
                continue
            try:
                if method_getid == 0:
                    idd = line.split(None, 1)[0]
                elif method_getid == 1:
                    idd = (line.split(None, 1)[0]).partition(";")[0]
                elif method_getid == 2:
                    idd = myfunc.GetSeqIDFromAnnotation(line)
                elif method_getid == 3:
                    if len(sel_field_list) < 2:
                        idd = line.split()[sel_field - 1]
                    else:
                        strs = line.split()
                        idd = tuple([strs[ff - 1] for ff in sel_field_list])
                else:
                    # Unknown method: no ID is available for this line.
                    print(method_getid)
                    continue
            except IndexError:
                sys.stderr.write("Bad line \"%s\"\n\n" % line)
                continue
            if idd in idListSet:
                fpout.write("%s\n" % line)
        lines = hdl.readlines()
    hdl.close()
    return 0
def ReadSingleFasta_modhmm(inFile):
    """Read a single sequence from a modhmm-style FASTA file.

    Lines are stripped of surrounding whitespace.  A line starting with
    '>' sets the sequence ID (``myfunc.GetSeqIDFromAnnotation``) and the
    annotation; lines starting with '/' (membrane topology labelling) are
    ignored; every other non-empty line is appended to the sequence.

    Reading is best effort: on an error a message is written to stderr and
    whatever has been read so far is returned.

    Returns:
        (seqID, annotation, aaSeq); each is an empty string when missing.
    """
    seqID = ""
    annotation = ""
    seqParts = []
    try:
        # "with" closes the file on every path.
        with open(inFile, "r") as fpin:
            for line in fpin:
                line = line.strip()
                if not line:
                    continue
                if line[0] == ">":
                    seqID = myfunc.GetSeqIDFromAnnotation(line)
                    annotation = line.lstrip(">").strip()
                elif line[0] != "/":  # neglecting membrane topology labelling
                    seqParts.append(line)
    except Exception:
        # "except Exception" rather than a bare except, so that
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        sys.stderr.write(
            "Except for the input file  %s in the function "
            "ReadSingleFasta_modhmm\n" % inFile)
    return (seqID, annotation, "".join(seqParts))
def ReadSignalPeptide_signalp(infile):
    """Read signal peptide predictions from a SignalP output file.

    Lines starting with '#' are skipped.  Of the remaining lines, those
    whose 10th whitespace-separated field is "Y" contribute an entry that
    maps the sequence ID (``myfunc.GetSeqIDFromAnnotation`` applied to the
    first field) to the third field converted to int.  Lines with fewer
    fields are ignored.

    Returns:
        The dict described above; an empty dict (after a message on
        stderr) when an IOError occurs.
    """
    try:
        fpin = open(infile, "r")
        lines = fpin.readlines()
        fpin.close()

        signalpDict = {}
        for line in lines:
            if line[0] == "#":
                continue
            fields = line.split()
            try:
                if fields[9] == "Y":
                    seqid = myfunc.GetSeqIDFromAnnotation(fields[0])
                    signalpDict[seqid] = int(fields[2])
            except IndexError:
                # Too few fields on this line.
                pass
        return signalpDict
    except IOError:
        sys.stderr.write("Failed to read infile %s\n" % infile)
        return {}
Beispiel #17
0
def WriteIndexFasta(seqWithAnno, fpdb, dbname, fpindex, cntdbfile, #{{{
        record_offset, idSet, idtype):
    """Append one record to an indexed fasta database.

    The sequence ID is ``myfunc.GetSeqIDFromAnnotation(seqWithAnno)`` for
    idtype 0 and the first word after the leading '>' for idtype 1.
    Records whose ID is already in idSet are ignored.  Otherwise the record
    plus a trailing newline is written to fpdb (the file
    ``<dbname><cntdbfile>.db`` is created when fpdb is None), an index line
    "seqid cntdbfile offset size" is written to fpindex and the ID is added
    to idSet.

    Returns:
        (fpdb, record_offset) -- the database handle and the offset at
        which the next record will start.
    """
    if idtype == 0:
        seqid = myfunc.GetSeqIDFromAnnotation(seqWithAnno)
    elif idtype == 1:
        seqid = myfunc.GetFirstWord(seqWithAnno.lstrip(">"))
    if seqid in idSet:
        return (fpdb, record_offset)

    record = seqWithAnno + "\n"
    recordSize = len(record)
    if fpdb is None:
        dbfile = dbname + "%d.db" % (cntdbfile)
        fpdb = open(dbfile, "wb")
        print("dbfile %s is created." % dbfile)
    fpindex.write("%s %d %d %d\n" % (seqid, cntdbfile, record_offset,
                                     recordSize))
    fpdb.write("%s" % record)
    idSet.add(seqid)
    return (fpdb, record_offset + recordSize)
def Labeltopologyfastaseq(queryTopoFile, alignFile, fastaFile, fpout):  #{{{
    """Copy fastaFile to fpout, adding a topology label line to each record.

    The query topology is read with ``myfunc.ReadSingleFasta``, the
    alignments with ``ReadNeedleAlignment``, and the labels are obtained
    from ``GetTopologyLabels``, indexed by sequence ID.  Every record of
    fastaFile is written unchanged, followed by a line "/<label>/".  A
    message goes to stderr when the label and the sequence differ in
    length.

    Lines outside a record (before the first '>' line) are skipped;
    previously such a line made the loop spin forever because the line
    index was never advanced.

    Returns:
        0

    Raises:
        Any exception from the steps above, re-raised after the function
        name has been written to stderr.
    """
    try:
        (queryID, queryAnnotation,
         queryTopology) = myfunc.ReadSingleFasta(queryTopoFile)
        # read in alignment
        alns = ReadNeedleAlignment(alignFile)
        topologyLabels = GetTopologyLabels(queryTopology, alns)

        # "with" closes the file even when readlines() raises.
        with open(fastaFile, "r") as fpin:
            lines = fpin.readlines()

        numLine = len(lines)
        i = 0
        while i < numLine:
            line = lines[i]
            if line[0] != '>':
                i += 1
                continue
            seqID = myfunc.GetSeqIDFromAnnotation(line)
            fpout.write("%s" % line)
            i += 1
            seqParts = []
            while i < numLine and lines[i][0] != '>':
                fpout.write("%s" % lines[i])
                seqParts.append(lines[i].strip())
                i += 1
            label = topologyLabels[seqID]
            fpout.write("/%s/\n" % label)
            if len("".join(seqParts)) != len(label):
                sys.stderr.write("%s: length not match\n" % seqID)
    except Exception:
        sys.stderr.write("except for the function:%s\n" %
                         sys._getframe().f_code.co_name)
        raise
    return 0
def ReadHHAlignResult(infile):
    """Parse an HH alignment result file into a list of hit dicts.

    The file is processed line by line, dispatching on the start of the
    current line:

      "Query"          -- query description (text from column 14) and its
                          sequence ID
      "Match_columns"  -- query length (second whitespace-separated field)
      " No Hit"        -- header of the hit table; each following line up
                          to the next empty line is scanned with
                          ScanHHHitLine and creates one hit
      "No <k>"         -- alignment section of hit k; adds description,
                          sequence ID, statistics (ScanHHAlignStatLine)
                          and the concatenated aligned sequences to the
                          k-th hit of the table
      "Done"           -- stops parsing

    Returns the list of hits, or an empty list when an IOError occurs.
    Malformed "Match_columns" / "No <k>" lines re-raise after a message on
    stderr; a malformed alignment block ends the program with status 1.

    NOTE(review): query_legnth, query_seqid and query_description are only
    bound once the "Query" and "Match_columns" lines have been seen --
    presumably these always precede the hit table; verify.
    """
    try:
        fpin = open(infile, "r")
        lines = fpin.read().split("\n")
        fpin.close()
        hitList = []
        numLine = len(lines)
        cnt = 0
        while cnt < numLine:
            if cnt < numLine and lines[cnt][0:5] == "Query":
                query_description = lines[cnt][14:]
                query_seqid = myfunc.GetSeqIDFromAnnotation(query_description)
                cnt += 1
            elif cnt < numLine and lines[cnt].find("Match_columns") == 0:
                try:
                    query_legnth = int(lines[cnt].split()[1])
                except (IndexError, ValueError, TypeError):
                    print >> sys.stderr, "Bad line:\"%s\"" % (lines[cnt])
                    raise
                cnt += 1
            elif lines[cnt][0:7] == " No Hit":
                # Hit table: one line per hit until the next empty line.
                cnt += 1
                while cnt < numLine and lines[cnt] != "":
                    hit = {}
                    #print lines[cnt]
                    (hit['prob'], hit['evalue'], hit['pvalue'], hit['score'],
                     hit['num_align_col'], hit['pos_query_begin'],
                     hit['pos_query_end'], hit['pos_template_begin'],
                     hit['pos_template_end'],
                     hit['template_length']) = ScanHHHitLine(lines[cnt])
                    hit['query_length'] = query_legnth
                    hit['query_seqid'] = query_seqid
                    hit['query_description'] = query_description
                    hitList.append(hit)
                    cnt += 1
                cnt += 1
            elif cnt < numLine and lines[cnt][0:3] == "No ":
                # Alignment section; j is the offset from its first line.
                j = 0
                jstart = cnt
                try:
                    # The number in the file is 1-based.
                    hitIndex = int(lines[cnt].split()[1]) - 1
                except (IndexError, ValueError, TypeError):
                    print >> sys.stderr, "Bad hit line: %s" % lines[cnt]
                    raise
                try:
                    hit = hitList[hitIndex]
                except IndexError:
                    print >> sys.stderr, "hitIndex=%d, numHit=%d" % (
                        hitIndex, len(hitList))
                    raise
                j += 1
                hit['hit_description'] = lines[jstart + j].lstrip(">")
                hit['hit_seqid'] = myfunc.GetSeqIDFromAnnotation(
                    hit['hit_description'])
                j += 1
                hit['statline'] = lines[jstart + j]
                # Overwrites 'prob', 'evalue' and 'score' taken from the
                # hit table with the values of the statistics line.
                (hit['prob'], hit['evalue'], hit['score'], hit['alignedcol'],
                 hit['identity'], hit['similarity'],
                 hit['sum_prob']) = ScanHHAlignStatLine(hit['statline'])
                j += 2
                alnseqList1 = []
                alnseqList2 = []
                # Each alignment block starts with a "Q " line; the
                # template line is read 4 lines below it.
                while lines[jstart + j][0:2] == "Q ":
                    try:
                        l1 = lines[jstart + j][17:]
                        l2 = lines[jstart + j + 4][17:]
                        s1 = l1.split()[1]
                        s2 = l2.split()[1]
                        alnseqList1.append(s1)
                        alnseqList2.append(s2)
                        # A block spans 8 lines when the line 5 below the
                        # "Q " line is not blank, otherwise 7.
                        if lines[jstart + j + 5].strip() != "":
                            j += 8
                        else:
                            j += 7
                    except IndexError:
                        msg = "Bad hhr result file %s. l1=%s, l2=%s"
                        print >> sys.stderr, msg % (infile, l1, l2)
                        sys.exit(1)
                hit['query_alignseq'] = "".join(alnseqList1)
                hit['template_alignseq'] = "".join(alnseqList2)
                cnt += j
            elif lines[cnt][0:4] == "Done":
                break
            else:
                cnt += 1
        return hitList
    except IOError:
        print >> sys.stderr, "Failed to read infile %s" % (infile)
        return []
                 $dbname0.db
  -q             Quiet mode
  -h, --help     Print this help message and exit

Created 2011-12-09, updated 2011-12-09, Nanjiang Shu 
"""
# Size limit for a single database file: 2 GiB.
MAX_DBFILE_SIZE = 2 * 1024 ** 3
# Number of characters requested per block read.
BLOCK_SIZE = 100 * 1000

def PrintHelp():
    """Print the module-level usage text to stdout."""
    print(usage)

def WriteIndex(seqWithAnno, fpdb, dbname, fpindex, cntdbfile, #{{{
        record_offset, idSet):
    """Append one record to an indexed fasta database.

    The previous header of this function was a pasted call site and not
    valid Python; the parameter list has been rebuilt from the names used
    in the body, in the order of that call and of WriteIndexFasta.

    The sequence ID is ``myfunc.GetSeqIDFromAnnotation(seqWithAnno)``.
    Records whose ID is already in idSet are ignored.  Otherwise the record
    plus a trailing newline is written to fpdb (the file
    ``<dbname><cntdbfile>.db`` is created when fpdb is None), an index line
    "seqid cntdbfile offset size" is written to fpindex and the ID is added
    to idSet.

    Returns:
        (fpdb, record_offset) -- the database handle and the offset at
        which the next record will start.
    """
    seqid = myfunc.GetSeqIDFromAnnotation(seqWithAnno)
    if seqid in idSet:
        return (fpdb, record_offset)

    seqWithAnno += "\n"
    if fpdb is None:
        dbfile = dbname + "%d.db" % (cntdbfile)
        fpdb = open(dbfile, "wb")
        print("dbfile %s is created." % dbfile)
    fpindex.write("%s %d %d %d\n" % (seqid, cntdbfile, record_offset,
                                     len(seqWithAnno)))
    fpdb.write("%s" % seqWithAnno)
    record_offset += len(seqWithAnno)
    idSet.add(seqid)
    return (fpdb, record_offset)
    
#}}}
def ExtractFromTopconsResult(recordContent):  #{{{
    """Parse the text of one TOPCONS result record.

    Recognised parts, each identified by the start of a line:

      "Sequence name"                  -- annotation (text from column 15)
      "<method> predicted topology"    -- the following lines up to the
                                          next empty line, concatenated
      "Predicted TOPCONS reliability"  -- following lines starting with a
                                          digit; the second field of every
                                          two-field line is averaged

    A section that runs to the end of the record without a terminating
    empty line is now accepted; previously it raised IndexError.

    Returns:
        A dict with the keys 'anno', 'seqid', 'seqlength' (length of the
        TOPCONS topology), 'rlty' (average reliability times 100; -100.0
        when no reliability value was found) and one 'predtopo_<method>'
        entry per method, set to "" when it contains
        "No TM-regions predicted".  An empty dict is returned when the
        record has no TOPCONS topology.
    """
    # Section header -> key of the record that receives the topology.
    sectionList = [
        ('SCAMPI predicted topology', 'predtopo_SCAMPI_msa'),
        ('Philius predicted topology', 'predtopo_Philius'),
        ('PolyPhobius predicted topology', 'predtopo_PolyPhobius'),
        ('OCTOPUS predicted topology', 'predtopo_OCTOPUS'),
        ('SPOCTOPUS predicted topology', 'predtopo_SPOCTOPUS'),
        ('TOPCONS predicted topology', 'predtopo_TOPCONS2'),
    ]
    topoNameList = [
        'predtopo_TOPCONS2', 'predtopo_OCTOPUS', 'predtopo_Philius',
        'predtopo_PolyPhobius', 'predtopo_SCAMPI_msa', 'predtopo_SPOCTOPUS'
    ]
    record = {}
    record['anno'] = ""
    record['rlty'] = -1
    for name in topoNameList:
        record[name] = ""

    lines = recordContent.split("\n")
    numLine = len(lines)
    i = 0
    while i < numLine:
        line = lines[i]
        if line.startswith("Sequence name"):
            record['anno'] = line[15:]
            i += 1
            continue

        sectionName = None
        for (header, name) in sectionList:
            if line.startswith(header):
                sectionName = name
                break

        if sectionName is not None:
            # Collect the topology lines up to the next empty line.
            j = i + 1
            while j < numLine and lines[j] != "":
                record[sectionName] += lines[j]
                j += 1
            i = j
        elif line.startswith("Predicted TOPCONS reliability"):
            sumrlty = 0.0
            cnt = 0
            j = i + 1
            while j < numLine and lines[j][0:1].isdigit():
                ss = lines[j].split()
                if len(ss) == 2:
                    sumrlty += float(ss[1])
                    cnt += 1
                j += 1
            if cnt > 0:
                record['rlty'] = sumrlty / (cnt)
            i = j
        else:
            i += 1

    # Applied to the -1 placeholder as well, as before.
    record['rlty'] *= 100.0
    record['seqid'] = myfunc.GetSeqIDFromAnnotation(record['anno'])

    if record['predtopo_TOPCONS2'] == "":
        return {}
    record['seqlength'] = len(record['predtopo_TOPCONS2'])
    for name in topoNameList:
        if record[name].find("No TM-regions predicted") != -1:
            record[name] = ""
    return record
def Topcons2Fasta_method1(infile, outpath):  #{{{
    """Convert a TOPCONS result file to topology files plus agreement logs.

    The input is read in chunks of BLOCK_SIZE characters and parsed with
    ``Read_topcons_result_from_buffer``.  Records without a TOPCONS
    topology are skipped.  For every other record the TOPCONS topology is
    compared with the five single-method topologies by
    ``lcmp.MatchTopology``.  With <root> being ``outpath`` joined with the
    base name of infile (extension removed), the following is written:

      <root>.agreement.stat.txt      seqid, number of identical
                                     topologies, number of predictors, the
                                     match flags and the reliability
      <root>_TOPCONS.sp_list         "seqid pos Y" for topologies with 'S'
      <root>_TOPCONS.topo            topologies containing 'M'
      <root>_TOPCONS_filterSP.topo   the same, every 'S' replaced by the
                                     state following the last 'S'
      <root>_TOPCONS.m1.topo         topologies on which all predictors
                                     agree, with at least 4 predictors
      <root>.m1.idt.log                    full agreement, >= 4 predictors
      <root>.m1.idt.but.numpred.lt.4.log   full agreement, < 4 predictors
      <root>.m1.nonidt.log                 all other records

    All files are now closed even when an exception is raised.  An IOError
    is reported on stderr and not re-raised, as before.
    """
    rootname = os.path.basename(os.path.splitext(infile)[0])
    prefix = outpath + os.sep + rootname
    outfile_SPlist = prefix + "_TOPCONS.sp_list"
    outfile_TOPCONS = prefix + "_TOPCONS.topo"
    outfile_TOPCONS_m1 = prefix + "_TOPCONS.m1.topo"
    outfile_TOPCONS_filterSP = prefix + "_TOPCONS_filterSP.topo"
    outfile_agreement = prefix + ".agreement.stat.txt"
    logfile1 = prefix + ".m1.idt.log"
    logfile2 = prefix + ".m1.nonidt.log"
    logfile3 = prefix + ".m1.idt.but.numpred.lt.4.log"

    # NOTE(review): there is no separator between the RLTY value and
    # "numIDTtopo" in this header -- presumably a missing space; it is kept
    # because readers of the files may depend on the current format.
    headerFmt = ">%s TOPCONS RLTY=%.2f" \
                "numIDTtopo=%d numPredictor=%d\n"
    logFmt = "%s RLTY= %.2f numIDTtopo= %d numPredictor= %d\n"

    openedFiles = []

    def _open(path, mode):
        # Open a file and remember it for the cleanup in "finally".
        fp = open(path, mode)
        openedFiles.append(fp)
        return fp

    try:
        try:
            fpout_SPlist = _open(outfile_SPlist, "w")
            fpout_TOPCONS = _open(outfile_TOPCONS, "w")
            fpout_TOPCONS_m1 = _open(outfile_TOPCONS_m1, "w")
            fpout_TOPCONS_filterSP = _open(outfile_TOPCONS_filterSP, "w")
            fplog1 = _open(logfile1, "w")
            fplog2 = _open(logfile2, "w")
            fplog3 = _open(logfile3, "w")
            fpout_agree = _open(outfile_agreement, "w")
            fpin = _open(infile, "r")

            unprocessedBuffer = ""
            isEOFreached = False
            while not isEOFreached:
                buff = fpin.read(BLOCK_SIZE)
                # A short read means the end of the file was reached.
                if len(buff) < BLOCK_SIZE:
                    isEOFreached = True
                recordList = []
                unprocessedBuffer = Read_topcons_result_from_buffer(
                    unprocessedBuffer + buff, recordList, isEOFreached)
                for record in recordList:
                    topcons = record['predtopo_TOPCONS2']
                    if topcons == "":
                        continue
                    seqid = myfunc.GetSeqIDFromAnnotation(record['anno'])
                    topoList = [
                        record['predtopo_OCTOPUS'],
                        record['predtopo_SPOCTOPUS'],
                        record['predtopo_SCAMPI_msa'],
                        record['predtopo_PolyPhobius'],
                        record['predtopo_Philius'],
                    ]
                    # matchList matches the target topology to the ordered
                    # topology list: 1 for identical, 0 for non identical
                    # and -1 for empty topology.
                    (matchList, numIDTtopo, numPredictor) = lcmp.MatchTopology(
                        topcons, topoList, min_TM_overlap, seqid)

                    fpout_agree.write("%s\t%d\t%d" %
                                      (seqid, numIDTtopo, numPredictor))
                    for tt in matchList:
                        fpout_agree.write("\t%d" % (tt))
                    fpout_agree.write("\t%6.2f" % (record['rlty']))
                    fpout_agree.write("\n")

                    header = headerFmt % (record['anno'], record['rlty'],
                                          numIDTtopo, numPredictor)
                    hasSP = topcons.find('S') >= 0
                    if hasSP:
                        fpout_SPlist.write(
                            "%s %d %s\n" %
                            (record['seqid'], topcons.rfind('S'), 'Y'))

                    if topcons.find('M') >= 0:
                        fpout_TOPCONS.write(header)
                        fpout_TOPCONS.write("%s\n" % topcons)
                        if hasSP:
                            # Replace the signal peptide by the state that
                            # follows its last position.
                            pp = topcons.rfind('S')
                            iostat = topcons[pp + 1]
                            top = topcons.replace('S', iostat)
                        else:
                            top = topcons
                        fpout_TOPCONS_filterSP.write(header)
                        fpout_TOPCONS_filterSP.write("%s\n" % top)

                    logLine = logFmt % (seqid, record['rlty'], numIDTtopo,
                                        numPredictor)
                    if numIDTtopo != numPredictor:
                        fplog2.write(logLine)
                    elif numPredictor >= 4:
                        fpout_TOPCONS_m1.write(header)
                        fpout_TOPCONS_m1.write("%s\n" % topcons)
                        fplog1.write(logLine)
                    else:
                        fplog3.write(logLine)
        finally:
            for fp in openedFiles:
                fp.close()

        print("Result have been output to")
        print("\t%s" % outfile_TOPCONS)
        print("\t%s" % outfile_agreement)
        print("\t%s" % logfile1)
        print("\t%s" % logfile2)
        print("\t%s" % logfile3)

    except IOError:
        msg = "Failed to read file {} in function {}"
        sys.stderr.write(
            msg.format(infile, sys._getframe().f_code.co_name) + "\n")
def Topcons2Fasta(infile, outpath):  #{{{
    try:
        rootname = os.path.basename(os.path.splitext(infile)[0])
        outfile_TOPCONS2 = outpath + os.sep + rootname + "_TOPCONS2.topo"
        outfile_OCTOPUS = outpath + os.sep + rootname + "_OCTOPUS.topo"
        outfile_SPOCTOPUS = outpath + os.sep + rootname + "_SPOCTOPUS.topo"
        outfile_SCAMPI_msa = outpath + os.sep + rootname + "_SCAMPI_msa.topo"
        outfile_Philius = outpath + os.sep + rootname + "_Philius.topo"
        outfile_PolyPhobius = outpath + os.sep + rootname + "_PolyPhobius.topo"
        outRLTYFile = outpath + os.sep + rootname + "_TOPCONS.rlty"
        fpout_TOPCONS2 = open(outfile_TOPCONS2, "w")
        fpout_OCTOPUS = open(outfile_OCTOPUS, "w")
        fpout_SPOCTOPUS = open(outfile_SPOCTOPUS, "w")
        fpout_SCAMPI_msa = open(outfile_SCAMPI_msa, "w")
        fpout_Philius = open(outfile_Philius, "w")
        fpout_PolyPhobius = open(outfile_PolyPhobius, "w")
        fpout_rlty = open(outRLTYFile, "w")
        fpin = open(infile, "r")
        unprocessedBuffer = ""
        isEOFreached = False
        processedTopoIDSet = set([])
        while 1:
            buff = fpin.read(BLOCK_SIZE)
            if len(buff) < BLOCK_SIZE:
                isEOFreached = True
            buff = unprocessedBuffer + buff
            recordList = []
            unprocessedBuffer = Read_topcons_result_from_buffer(
                buff, recordList, isEOFreached)
            if len(recordList) > 0:
                for record in recordList:
                    seqid = myfunc.GetSeqIDFromAnnotation(record['anno'])

                    if record['predtopo_TOPCONS2'] != "":
                        fpout_TOPCONS.write(
                            ">%s predtopo_TOPCONS2 rlty=%.2f\n" %
                            (record['anno'], record['rlty']))
                        fpout_TOPCONS.write("%s\n" %
                                            record['predtopo_TOPCONS2'])
                    if record['predtopo_OCTOPUS'] != "":
                        fpout_OCTOPUS.write(">%s predtopo_OCTOPUS\n" %
                                            (record['anno']))
                        fpout_OCTOPUS.write("%s\n" %
                                            record['predtopo_OCTOPUS'])
                    if record['predtopo_SPOCTOPUS'] != "":
                        fpout_SPOCTOPUS.write(">%s predtopo_SPOCTOPUS\n" %
                                              (record['anno']))
                        fpout_SPOCTOPUS.write("%s\n" %
                                              record['predtopo_SPOCTOPUS'])
                    if record['predtopo_SCAMPI_msa'] != "":
                        fpout_SCAMPI_msa.write(">%s predtopo_SCAMPI_msa\n" %
                                               (record['anno']))
                        fpout_SCAMPI_msa.write("%s\n" %
                                               record['predtopo_SCAMPI_msa'])
                    if record['predtopo_Philius'] != "":
                        fpout_Philius.write(">%s predtopo_Philius\n" %
                                            (record['anno']))
                        fpout_Philius.write("%s\n" %
                                            record['predtopo_Philius'])
                    if record['predtopo_PolyPhobius'] != "":
                        fpout_PolyPhobius.write(">%s predtopo_PolyPhobius\n" %
                                                (record['anno']))
                        fpout_PolyPhobius.write("%s\n" %
                                                record['predtopo_PolyPhobius'])

                    if record['rlty'] != -100.0:
                        fpout_rlty.write("%s %.2f\n" % (seqid, record['rlty']))
            if isEOFreached == True:
                break
        fpin.close()
        fpout_TOPCONS.close()
        fpout_OCTOPUS.close()
        fpout_SCAMPI_seq.close()
        fpout_SCAMPI_msa.close()
        fpout_PRODIV.close()
        fpout_PRO.close()
        fpout_rlty.close()

        print "Results have been output to"
        print "\t%s" % outfile_TOPCONS
        print "\t%s" % outfile_OCTOPUS
        print "\t%s" % outfile_SCAMPI_seq
        print "\t%s" % outfile_SCAMPI_msa
        print "\t%s" % outfile_PRODIV
        print "\t%s" % outfile_PRO
        print "\t%s" % outRLTYFile

    except IOError:
        print >> sys.stderr, "Failed to open file %s for read" % (infile)
        raise
def TopconsSingle2Fasta(infile, outpath):  #{{{
    try:
        rootname = os.path.basename(os.path.splitext(infile)[0])
        outfile_topcons_single = outpath + os.sep + rootname + "_topcons_single.topo"
        outfile_scampi_single = outpath + os.sep + rootname + "_scampi_single.topo"
        outfile_hmmtop = outpath + os.sep + rootname + "_hmmtop.topo"
        outfile_stmhmm = outpath + os.sep + rootname + "_stmhmm.topo"
        outfile_memsat = outpath + os.sep + rootname + "_memsat.topo"
        outRLTYFile = outpath + os.sep + rootname + "_topcons_single.rlty"
        fpout_topcons_single = open(outfile_topcons_single, "w")
        fpout_scampi_single = open(outfile_scampi_single, "w")
        fpout_hmmtop = open(outfile_hmmtop, "w")
        fpout_stmhmm = open(outfile_stmhmm, "w")
        fpout_memsat = open(outfile_memsat, "w")
        fpout_rlty = open(outRLTYFile, "w")
        fpin = open(infile, "r")
        unprocessedBuffer = ""
        isEOFreached = False
        processedTopoIDSet = set([])
        while 1:
            buff = fpin.read(BLOCK_SIZE)
            if len(buff) < BLOCK_SIZE:
                isEOFreached = True
            buff = unprocessedBuffer + buff
            recordList = []
            unprocessedBuffer = Read_topconssingle_result_from_buffer(
                buff, recordList, isEOFreached)
            if len(recordList) > 0:
                for record in recordList:
                    seqid = myfunc.GetSeqIDFromAnnotation(record['anno'])
                    if record['predtopo_topcons_single'] != "":
                        fpout_topcons_single.write(
                            ">%s topcons_single rlty=%.2f\n" %
                            (record['anno'], record['rlty']))
                        fpout_topcons_single.write(
                            "%s\n" % record['predtopo_topcons_single'])
                    if record['predtopo_scampi_single'] != "":
                        fpout_scampi_single.write(">%s scampi_single\n" %
                                                  (record['anno']))
                        fpout_scampi_single.write(
                            "%s\n" % record['predtopo_scampi_single'])
                    if record['predtopo_hmmtop'] != "":
                        fpout_hmmtop.write(">%s hmmtop\n" % (record['anno']))
                        fpout_hmmtop.write("%s\n" % record['predtopo_hmmtop'])
                    if record['predtopo_stmhmm'] != "":
                        fpout_stmhmm.write(">%s stmhmm\n" % (record['anno']))
                        fpout_stmhmm.write("%s\n" % record['predtopo_stmhmm'])
                    if record['predtopo_memsat'] != "":
                        fpout_memsat.write(">%s memsat\n" % (record['anno']))
                        fpout_memsat.write("%s\n" % record['predtopo_memsat'])
                    if record['rlty'] != -100.0:
                        fpout_rlty.write("%s %.2f\n" % (seqid, record['rlty']))
            if isEOFreached == True:
                break
        fpin.close()
        fpout_topcons_single.close()
        fpout_scampi_single.close()
        fpout_memsat.close()
        fpout_stmhmm.close()
        fpout_hmmtop.close()
        fpout_rlty.close()

        print "Result have been output to"
        print "\t%s" % outfile_topcons_single
        print "\t%s" % outfile_scampi_single
        print "\t%s" % outfile_hmmtop
        print "\t%s" % outfile_stmhmm
        print "\t%s" % outfile_memsat
        print "\t%s" % outRLTYFile

    except IOError:
        print >> sys.stderr, "Failed to open file %s for read" % (infile)
        raise
Beispiel #25
0
    pfamidList = []

    extra_desp_dict = {}
    if extra_description_file != "":
        hdl_extra = myfunc.ReadLineByBlock(extra_description_file)
        if hdl_extra.failure:
            print >> sys.stderr, "Failed to read extra_description_file %s." % (
                extra_description_file)
            return 1
        lines = hdl_extra.readlines()
        while lines != None:
            for line in lines:
                line = line.strip()
                if not line or line[0] == "#":
                    continue
                seqid = myfunc.GetSeqIDFromAnnotation(line)
                if seqid != "":
                    extra_desp_dict[seqid] = line
            lines = hdl_extra.readlines()

    hdl = myfunc.ReadLineByBlock(mapfile)
    if hdl.failure:
        print >> sys.stderr, "Failed to read mapfile %s. exit" % (mapfile)
        return 1

    cntfam = 0
    lines = hdl.readlines()
    while lines != None:
        for line in lines:
            line = line.strip()
            if not line or line[0] == "#":
Beispiel #26
0
def WriteSubconsTextResultFile(
        outfile,
        outpath_result,
        maplist,  #{{{
        runtime_in_sec,
        base_www_url,
        statfile=""):
    try:
        fpout = open(outfile, "w")
        if statfile != "":
            fpstat = open(statfile, "w")

        date_str = time.strftime(FORMAT_DATETIME)
        print >> fpout, "##############################################################################"
        print >> fpout, "Subcons result file"
        print >> fpout, "Generated from %s at %s" % (base_www_url, date_str)
        print >> fpout, "Total request time: %.1f seconds." % (runtime_in_sec)
        print >> fpout, "##############################################################################"
        cnt = 0
        for line in maplist:
            strs = line.split('\t')
            subfoldername = strs[0]
            length = int(strs[1])
            desp = strs[2]
            seq = strs[3]
            seqid = myfunc.GetSeqIDFromAnnotation(desp)
            print >> fpout, "Sequence number: %d" % (cnt + 1)
            print >> fpout, "Sequence name: %s" % (desp)
            print >> fpout, "Sequence length: %d aa." % (length)
            print >> fpout, "Sequence:\n%s\n\n" % (seq)

            rstfile = "%s/%s/%s/query_0_final.csv" % (outpath_result,
                                                      subfoldername, "plot")

            if os.path.exists(rstfile):
                content = myfunc.ReadFile(rstfile).strip()
                lines = content.split("\n")
                if len(lines) >= 6:
                    header_line = lines[0].split("\t")
                    if header_line[0].strip() == "":
                        header_line[0] = "Method"
                        header_line = [x.strip() for x in header_line]

                    data_line = []
                    for i in xrange(1, len(lines)):
                        strs1 = lines[i].split("\t")
                        strs1 = [x.strip() for x in strs1]
                        data_line.append(strs1)

                    content = tabulate.tabulate(data_line, header_line,
                                                'plain')
            else:
                content = ""
            if content == "":
                content = "***No prediction could be produced with this method***"

            print >> fpout, "Prediction results:\n\n%s\n\n" % (content)

            print >> fpout, "##############################################################################"
            cnt += 1

    except IOError:
        print "Failed to write to file %s" % (outfile)
def TopconsSingle2Fasta_method1(infile, outpath):  #{{{
    try:
        rootname = os.path.basename(os.path.splitext(infile)[0])
        outfile_topcons_single = outpath + os.sep + rootname + "_topcons_single.m1.topo"
        outfile_agreement = outpath + os.sep + rootname + ".agreement.stat.txt"
        logfile1 = outpath + os.sep + rootname + ".m1.idt.log"
        logfile2 = outpath + os.sep + rootname + ".m1.nonidt.log"
        fpout_topcons_single = open(outfile_topcons_single, "w")
        fplog1 = open(logfile1, "w")
        fplog2 = open(logfile2, "w")
        fpout_agree = open(outfile_agreement, "w")

        fpin = open(infile, "r")
        unprocessedBuffer = ""
        isEOFreached = False
        processedTopoIDSet = set([])
        while 1:
            buff = fpin.read(BLOCK_SIZE)
            if len(buff) < BLOCK_SIZE:
                isEOFreached = True
            buff = unprocessedBuffer + buff
            recordList = []
            unprocessedBuffer = Read_topconssingle_result_from_buffer(
                buff, recordList, isEOFreached)
            if len(recordList) > 0:
                for record in recordList:
                    # ignore cases where topcons_single does not give a prediction
                    if record['predtopo_topcons_single'] == "":
                        continue
                    seqid = myfunc.GetSeqIDFromAnnotation(record['anno'])
                    topoList = []
                    topoList.append(record['predtopo_scampi_single'])
                    topoList.append(record['predtopo_hmmtop'])
                    topoList.append(record['predtopo_stmhmm'])
                    topoList.append(record['predtopo_memsat'])
                    (matchList, numIDTtopo, numPredictor) = lcmp.MatchTopology(
                        record['predtopo_topcons_single'], topoList,
                        min_TM_overlap, seqid)
                    fpout_agree.write("%s\t%d\t%d" %
                                      (seqid, numIDTtopo, numPredictor))
                    for tt in matchList:
                        fpout_agree.write("\t%d" % (tt))
                    fpout_agree.write("\t%6.2f" % (record['rlty']))
                    fpout_agree.write("\n")

                    if numIDTtopo == numPredictor and numIDTtopo >= 2:
                        msg = ">%s topcons_single RLTY=%.2f"\
                                "numIDTtopo=%d numPredictor=%d\n"
                        fpout_topcons_single.write(
                            msg % (record['anno'], record['rlty'], numIDTtopo,
                                   numPredictor))
                        fpout_topcons_single.write(
                            "%s\n" % record['predtopo_topcons_single'])
                        msg = "%s RLTY= %.2f numIDTtopo= %d numPredictor= %d"
                        print >> fplog1, msg % (seqid, record['rlty'],
                                                numIDTtopo, numPredictor)
                    else:
                        msg = "%s RLTY= %.2f numIDTtopo= %d numPredictor= %d"
                        print >> fplog2, msg % (seqid, record['rlty'],
                                                numIDTtopo, numPredictor)

            if isEOFreached == True:
                break
        fpin.close()
        fpout_topcons_single.close()
        fplog1.close()
        fplog2.close()
        fpout_agree.close()

        print "Result have been output to"
        print "\t%s" % outfile_topcons_single
        print "\t%s" % outfile_agreement
        print "\t%s" % logfile1
        print "\t%s" % logfile2

    except IOError:
        msg = "Failed to read file {} in function {}"
        print >> sys.stderr, msg.format(infile, sys._getframe().f_code.co_name)
        return 1