Esempio n. 1
0
    def run_spaan(inputFasta, outputDir, rawFlag):
        """Run the SPAAN adhesin-probability predictor and write a TSV.

        Produces <stem>.spaan.tsv (columns: ID, SPAAN_Score) under
        outputDir/SPAAN.  Sequences missing from SPAAN's raw output get a
        default score of 0.0.  The raw output file is deleted unless
        rawFlag is truthy.
        """
        command = SPAAN_PATH
        rawOutput = os.path.join(outputDir, "SPAAN")
        if not os.path.exists(rawOutput):
            os.mkdir(rawOutput)

        sequenceIDs = readfasta.readFastaDesc(inputFasta, key="full")

        # Compute the raw-output path once (it was rebuilt four times).
        rawPath = os.path.join(rawOutput, "%s.output" % Path(inputFasta).stem)
        # NOTE(review): shell string interpolation — safe only while these
        # paths contain no shell metacharacters; consider subprocess.run
        # with an argument list if they ever can.
        os.system("%s %s %s" % (command, inputFasta, rawPath))

        output = ['ID\tSPAAN_Score']
        values = {}
        # Skip the header line; column 3 is the FASTA ID (with a leading
        # '>'), column 2 is the score — assumed from the indexing below,
        # confirm against SPAAN's output format.
        with open(rawPath) as handle:
            for row in handle.read().split('\n')[1:]:
                if row == '':
                    continue
                tokens = row.split('\t')
                values[tokens[2].strip('>')] = tokens[1]
        for fastaID in sequenceIDs:
            if fastaID in values:
                output.append('\t'.join([fastaID, values[fastaID]]))
            else:
                output.append(fastaID + '\t0.0')  # no prediction -> 0.0
        tsvPath = os.path.join(rawOutput,
                               "%s.spaan.tsv" % Path(inputFasta).stem)
        with open(tsvPath, 'w') as handle:
            handle.write('\n'.join(output))
        if not rawFlag:
            os.remove(rawPath)
Esempio n. 2
0
    def run_psortb(inputFasta, outputDir, organism, multiFlag, process,
                   rawFlag):
        """Run PSORTb subcellular-localization prediction and write a TSV.

        Produces <stem>.psortb.tsv under outputDir/PSORTB with the final
        localization call plus six per-compartment probabilities (raw
        scores divided by 10).  organism selects gram+ (-p) or gram- (-n)
        mode.  The raw PSORTb report is deleted unless rawFlag is truthy.
        multiFlag and process are accepted for signature parity with the
        other run_* methods but are not used here.
        """
        command = PSORTB_PATH
        rawOutput = os.path.join(outputDir, "PSORTB")
        if not os.path.exists(rawOutput):
            os.mkdir(rawOutput)

        sequenceIDs = readfasta.readFastaDesc(inputFasta, key="full")

        # Clear stale raw reports from previous runs.  The original only
        # removed gram-positive files, so a gram-negative run could pick
        # up a leftover gramneg report; remove both kinds.
        for file in os.listdir(rawOutput):
            if 'psortb_grampos.txt' in file or 'psortb_gramneg.txt' in file:
                os.remove(os.path.join(rawOutput, file))

        if organism.lower() in ["gram+", "g+"]:
            os.system("%s -p -i %s" % (command, inputFasta))
            marker = 'psortb_grampos.txt'
        elif organism.lower() in ["gram-", "g-"]:
            os.system("%s -n -i %s" % (command, inputFasta))
            marker = 'psortb_gramneg.txt'
        else:
            # Previously fell through to a NameError on rawFile.
            raise ValueError(
                "run_psortb requires a gram+/gram- organism, got %r" %
                organism)
        rawFile = None
        for file in os.listdir(rawOutput):
            if marker in file:
                rawFile = file
                break
        if rawFile is None:
            # Previously an undefined rawFile raised NameError here.
            raise RuntimeError("PSORTB produced no %s file in %s" %
                               (marker, rawOutput))

        output = [
            '\t'.join([
                "ID", "SubcellularLocation", "Extracellular_Probability",
                "CytoplasmicMembrane_Probability", "Cytoplasmic_Probability",
                "Cellwall_Probability", "Periplasmic_Probability",
                "OuterMembrane_Probability"
            ])
        ]
        locs = [
            "Extracellular", "CytoplasmicMembrane", "Cytoplasmic", "Cellwall",
            "Periplasmic", "OuterMembrane"
        ]
        values = {}
        with open(os.path.join(rawOutput, rawFile)) as handle:
            report = handle.read()
        # PSORTb separates entries with a 79-dash rule; the trailing split
        # fragment after the last rule is dropped.
        for entry in re.split('[-]{79}', report)[:-1]:
            # First line of an entry is 'SeqID: <id>' — [7:] drops the
            # prefix (assumed from the slice; confirm against PSORTb v3).
            fastaID = entry.strip().split('\n')[0][7:].strip()
            value = [""] + ["0.0"] * len(locs)
            value[0] = re.split(
                '[ ]+',
                entry[entry.find("Final Prediction:"):].split('\n')[1])[1]
            for row in entry[entry.find("Localization Scores:"):entry.
                             find("Final Prediction:")].split('\n')[1:-1]:
                tokens = re.split('[ ]+', row)
                # Raw scores run 0-10; normalize to 0-1.
                value[locs.index(tokens[1]) + 1] = str(float(tokens[2]) / 10.0)
            values[fastaID] = value
        for fastaID in sequenceIDs:
            # Sequences absent from the report previously raised KeyError;
            # emit the same neutral defaults used above instead.
            output.append('\t'.join(
                [fastaID] +
                values.get(fastaID, [""] + ["0.0"] * len(locs))))
        tsvPath = os.path.join(rawOutput,
                               "%s.psortb.tsv" % Path(inputFasta).stem)
        with open(tsvPath, 'w') as handle:
            handle.write('\n'.join(output))
        if not rawFlag:
            os.remove(os.path.join(rawOutput, rawFile))
Esempio n. 3
0
 def split_files(inputFasta, tmpDir):
     """Split inputFasta into evenly sized chunk files under tmpDir.

     Writes input.fasta.<i> chunk files and returns (inFiles, outFiles),
     where outFiles are the matching output.raw.<i> paths (not created
     here) for the tools that will consume the chunks.
     """
     fasta = readfasta.readFasta(inputFasta, key="full", strip=False)
     sequenceIDs = readfasta.readFastaDesc(inputFasta, key="full")
     inFiles = []
     outFiles = []
     # The original wrapped math.ceil around len() alone — a no-op on an
     # int — so this has always been floor division; keep that behavior
     # explicitly.  split+1 chunks are produced.
     split = len(sequenceIDs) // SPLIT_LIMIT
     size = int(math.ceil(len(sequenceIDs) / float(split + 1)))
     for i in range(split + 1):
         chunkIDs = list(fasta.keys())[i * size:(i + 1) * size]
         # join avoids the quadratic '+=' string build of the original.
         textBuffer = ''.join(fastaID + fasta[fastaID]
                              for fastaID in chunkIDs)
         inFile = os.path.join(tmpDir, "input.fasta.%i" % i)
         with open(inFile, 'w') as handle:
             handle.write(textBuffer)
         inFiles.append(inFile)
         outFiles.append(os.path.join(tmpDir, "output.raw.%i" % i))
     return (inFiles, outFiles)
Esempio n. 4
0
    def run_immugen(inputFasta, outputDir, rawFlag):
        """Run the IMGEN immunogenicity predictor and write a TSV.

        Produces <stem>.imgen.tsv (columns: ID, Immunogenicity_Score)
        under outputDir/IMGEN.  IMGEN consumes bare sequences, so headers
        are stripped into a temporary file first.  Sequences missing from
        the IMGEN output default to 0.0.  The raw output file is deleted
        unless rawFlag is truthy; the temp file is always deleted.
        """
        command = IMGEN_PATH
        rawOutput = os.path.join(outputDir, "IMGEN")
        if not os.path.exists(rawOutput):
            os.mkdir(rawOutput)

        fasta = readfasta.readFasta(inputFasta, key="full")
        sequenceIDs = readfasta.readFastaDesc(inputFasta, key="full")

        # One write of all sequences instead of one append-mode reopen
        # per sequence.
        tmpInput = os.path.join(rawOutput, "sequence.tmp")
        with open(tmpInput, 'w') as handle:
            for seq in fasta.values():
                handle.write("%s\n" % seq)

        rawPath = os.path.join(rawOutput, "%s.output" % Path(inputFasta).stem)
        os.system("python2.7 %s %s > %s" % (command, tmpInput, rawPath))

        # Reverse map sequence -> FASTA ID, built once instead of an O(n)
        # list(...).index() scan per output row.  Like .index(), duplicate
        # sequences resolve to the first ID carrying that sequence.
        seqToID = {}
        for fastaID, seq in fasta.items():
            seqToID.setdefault(seq, fastaID)

        output = ['ID\tImmunogenicity_Score']
        values = {}
        # IMGEN output: four banner/header lines, then CSV rows of
        # (sequence, ?, score) — assumed from the indexing; confirm.
        with open(rawPath) as handle:
            for row in handle.read().split('\n')[4:]:
                if len(row) == 0:
                    continue
                tokens = row.split(',')
                values[seqToID[tokens[0]]] = tokens[2]
        for fastaID in sequenceIDs:
            if fastaID in values:
                output.append('\t'.join([fastaID, values[fastaID]]))
            else:
                output.append(fastaID + '\t0.0')  # no prediction -> 0.0
        tsvPath = os.path.join(rawOutput,
                               "%s.imgen.tsv" % Path(inputFasta).stem)
        with open(tsvPath, 'w') as handle:
            handle.write('\n'.join(output))
        if not rawFlag:
            os.remove(rawPath)
        os.remove(tmpInput)
Esempio n. 5
0
    def makeInput(self, inputFasta, outputDir, organism, incFeatures):
        """Assemble the prediction input matrix from per-method feature TSVs.

        Joins the <stem>.<method>.tsv files under outputDir/_FEATURE on
        sequence ID and writes <stem>.input.tsv to outputDir.  When the
        organism is gram-stained, a Gram column (1 = gram+, 0 = gram-) is
        inserted after the ID; otherwise only ID leads the row.
        """
        featureDir = os.path.join(outputDir, "_FEATURE")

        def _merge_features(masterLabels, masterData):
            # Append every method's value columns onto the master table.
            # PSORTb's first value column (SubcellularLocation) is
            # categorical and excluded from the numeric matrix, hence the
            # start offset of 2.  Both lists are mutated in place.
            # The original repeated this loop verbatim three times.
            for method in incFeatures:
                tsvFile = os.path.join(
                    featureDir, method.upper(),
                    "%s.%s.tsv" % (Path(inputFasta).stem, method))
                start = 2 if method == "psortb" else 1
                for (i, line) in enumerate(open(tsvFile).read().splitlines()):
                    tokens = line.split('\t')
                    if i == 0:
                        masterLabels += tokens[start:]
                    else:
                        masterData[tokens[0]] += tokens[start:]

        sequenceIDs = readfasta.readFastaDesc(inputFasta, key="full")
        masterData = {}
        if organism.lower() in ["gram+", "g+", "gram-", "g-"]:
            masterLabels = ["ID", "Gram"]
            gram = "1" if organism.lower() in ["gram+", "g+"] else "0"
            for fastaID in sequenceIDs:
                masterData[fastaID] = [fastaID, gram]
        else:
            masterLabels = ["ID"]
            for fastaID in sequenceIDs:
                masterData[fastaID] = [fastaID]
        _merge_features(masterLabels, masterData)

        output = ["\t".join(masterLabels)]
        for fastaID in masterData.keys():
            output.append("\t".join(masterData[fastaID]))
        tsvPath = os.path.join(outputDir,
                               "%s.input.tsv" % Path(inputFasta).stem)
        with open(tsvPath, 'w') as handle:
            handle.write('\n'.join(output))
Esempio n. 6
0
    def run_descriptor(inputFasta, outputDir, rawFlag):
        """Compute four families of protein sequence descriptors and write TSVs.

        For every sequence in inputFasta, computes amino-acid composition,
        CTD, quasi-sequence-order, and Moreau-Broto/Geary autocorrelation
        descriptors via GetProDes.  The combined matrix is always written
        to <stem>.mdesc.tsv under outputDir/MDESC; the four per-family
        tables are additionally written when rawFlag is truthy.
        """
        rawOutput = os.path.join(outputDir, "MDESC")
        if not os.path.exists(rawOutput):
            os.mkdir(rawOutput)

        fasta = readfasta.readFasta(inputFasta, key="full")
        sequenceIDs = readfasta.readFastaDesc(inputFasta, key="full")

        # One list of TSV rows per descriptor family plus the combined
        # table; header rows are appended only for the first sequence.
        aacomp = []
        ctd = []
        seqord = []
        autocor = []
        output = []
        for fastaID in sequenceIDs:
            sequence = fasta[fastaID]
            Des = GetProDes(sequence)
            # Amino-acid composition.  dict.update(features, ...) is an
            # unbound-method spelling of features.update(...).
            features = {}
            dict.update(features, Des.GetAAComp())
            keys = list(features.keys())
            if len(aacomp) == 0:
                aacomp.append('\t'.join(['ID'] + keys))
                output.append('\t'.join(['ID'] + keys))
            tmp = []
            for key in keys:
                tmp.append(str(features[key]))
            aacomp.append('\t'.join([fastaID] + tmp))
            output.append('\t'.join([fastaID] + tmp))

            # CTD descriptors.  The combined table grows sideways by
            # mutating its header (output[0]) and the current sequence's
            # row (output[-1]) in place — order-sensitive.
            features = Des.GetCTD()
            keys = list(features.keys())
            if len(ctd) == 0:
                ctd.append('\t'.join(['ID'] + keys))
                output[0] += '\t%s' % '\t'.join(keys)
            tmp = []
            for key in keys:
                tmp.append(str(features[key]))
            ctd.append('\t'.join([fastaID] + tmp))
            output[-1] += '\t%s' % '\t'.join(tmp)

            # Quasi-sequence-order descriptors; lag is capped at
            # MIN_PEPTIDE_LENGTH, presumably so the shortest accepted
            # sequences stay valid — confirm against the caller's checks.
            features = {}
            dict.update(features, Des.GetQSO(maxlag=MIN_PEPTIDE_LENGTH))
            keys = list(features.keys())
            if len(seqord) == 0:
                seqord.append('\t'.join(['ID'] + keys))
                output[0] += '\t%s' % '\t'.join(keys)
            tmp = []
            for key in keys:
                tmp.append(str(features[key]))
            seqord.append('\t'.join([fastaID] + tmp))
            output[-1] += '\t%s' % '\t'.join(tmp)

            # Moreau-Broto and Geary autocorrelation, merged into one
            # feature dict before emission.
            features = {}
            dict.update(features,
                        Des.GetMoreauBrotoAuto(maxlag=MIN_PEPTIDE_LENGTH))
            dict.update(features, Des.GetGearyAuto(maxlag=MIN_PEPTIDE_LENGTH))
            keys = list(features.keys())
            if len(autocor) == 0:
                autocor.append('\t'.join(['ID'] + keys))
                output[0] += '\t%s' % '\t'.join(keys)
            tmp = []
            for key in keys:
                tmp.append(str(features[key]))
            autocor.append('\t'.join([fastaID] + tmp))
            output[-1] += '\t%s' % '\t'.join(tmp)

        # Per-family tables only on request; the combined table always.
        if rawFlag:
            open(
                os.path.join(rawOutput,
                             "%s.aacomp.tsv" % Path(inputFasta).stem),
                'w').write('\n'.join(aacomp))
            open(os.path.join(rawOutput, "%s.ctd.tsv" % Path(inputFasta).stem),
                 'w').write('\n'.join(ctd))
            open(
                os.path.join(rawOutput,
                             "%s.seqord.tsv" % Path(inputFasta).stem),
                'w').write('\n'.join(seqord))
            open(
                os.path.join(rawOutput,
                             "%s.autocor.tsv" % Path(inputFasta).stem),
                'w').write('\n'.join(autocor))
        open(os.path.join(rawOutput, "%s.mdesc.tsv" % Path(inputFasta).stem),
             'w').write('\n'.join(output))

        # NOTE(review): this trailing loop assigns `sequence` and does
        # nothing else — it looks like the function's tail was truncated
        # in this copy; confirm against the original source.
        for fastaID in sequenceIDs:
            sequence = fasta[fastaID]
Esempio n. 7
0
    def run_tmhmm(inputFasta, outputDir, multiFlag, process, rawFlag):
        """Run TMHMM transmembrane-helix prediction and write a TSV.

        Produces <stem>.tmhmm.tsv under outputDir/TMHMM with four columns
        per sequence parsed from TMHMM's '#' header lines: predicted TMH
        count, expected AAs in TMHs, expected AAs in the first 60
        residues, and total N-in probability.  When multiFlag is true and
        the input exceeds SPLIT_LIMIT sequences, the FASTA is split and
        the chunks are run in parallel via Feature.mp_run, then the raw
        outputs are recombined.  The raw output is deleted unless rawFlag
        is truthy.
        """
        command = TMHMM_PATH
        rawOutput = os.path.join(outputDir, "TMHMM")
        if not os.path.exists(rawOutput):
            os.mkdir(rawOutput)

        sequenceIDs = readfasta.readFastaDesc(inputFasta, key="full")

        # Split into chunk files for parallel runs, or run the whole
        # FASTA as a single job.
        if multiFlag.lower() in ['t', 'true'
                                 ] and len(sequenceIDs) > SPLIT_LIMIT:
            (inFiles, outFiles) = Feature.split_files(inputFasta, rawOutput)
        else:
            inFiles = [inputFasta]
            outFiles = [
                os.path.join(rawOutput, "%s.output" % Path(inputFasta).stem)
            ]

        # One shell command per chunk, fanned out over `process` workers.
        clines = []
        for (f, inFile) in enumerate(inFiles):
            outFile = outFiles[f]
            cline = "%s %s > %s" % (command, inFile, outFile)
            clines.append(cline)
        Feature.mp_run(clines, process)

        # Stitch the chunk outputs back into one raw file and delete the
        # intermediates.
        if multiFlag.lower() in ['t', 'true'
                                 ] and len(sequenceIDs) > SPLIT_LIMIT:
            Feature.combine_and_cleanup_files(
                inFiles, outFiles,
                os.path.join(rawOutput, "%s.output" % Path(inputFasta).stem))

        # TMHMM truncates IDs at the first space; map the short ID back
        # to the full header so the TSV keeps full identifiers.
        fasta = readfasta.readFasta(inputFasta, key="full", strip=False)
        sequenceIDs = {}
        for fastaID in fasta.keys():
            shortID = fastaID.strip().strip('>').split(' ')[0]
            sequenceIDs[shortID] = fastaID.strip().strip('>')
        values = {}
        # TMHMM summary lines look like '# <id> <label...>: <value>';
        # tokens[1] is the sequence ID, tokens[-1] the value.
        for row in open(
                os.path.join(rawOutput, "%s.output" %
                             Path(inputFasta).stem)).read().split('\n'):
            if row.startswith('#'):
                tokens = row.split(' ')
                if tokens[1] not in values.keys():
                    values[tokens[1]] = [""] * 4
                if "Number of predicted TMHs:" in row:
                    values[tokens[1]][0] = tokens[-1].strip()
                if "Exp number of AAs in TMHs:" in row:
                    values[tokens[1]][1] = tokens[-1].strip()
                if "Exp number, first 60 AAs:" in row:
                    values[tokens[1]][2] = tokens[-1].strip()
                if "Total prob of N-in:" in row:
                    values[tokens[1]][3] = tokens[-1].strip()

        output = [
            "ID\tPredicted_TMH#\tExp_AAs#\tExp_first_60_AAs#\tTotal_N-in_prob"
        ]
        # Sequences TMHMM produced nothing for get four empty columns.
        for shortID in sequenceIDs.keys():
            if shortID in values.keys():
                output.append('\t'.join([sequenceIDs[shortID]] +
                                        values[shortID]))
            else:
                output.append('\t'.join([sequenceIDs[shortID]] + [""] * 4))
        open(os.path.join(rawOutput, "%s.tmhmm.tsv" % Path(inputFasta).stem),
             'w').write('\n'.join(output))
        if not rawFlag:
            os.remove(
                os.path.join(rawOutput, "%s.output" % Path(inputFasta).stem))
        # NOTE(review): assumes TMHMM's TMHMM_* scratch entries in the
        # working directory are directories — rmtree fails on plain
        # files; confirm against the TMHMM wrapper's behavior.
        for tmpFile in glob.glob(os.path.join(os.getcwd(), "TMHMM_*")):
            shutil.rmtree(tmpFile)
Esempio n. 8
0
    def run_signalp(inputFasta, outputDir, organism, multiFlag, process,
                    rawFlag):
        """Run SignalP signal-peptide prediction and write a TSV.

        Produces <stem>.signalp.tsv (columns: ID, SignalP_DScore) under
        outputDir/SIGNALP.  When multiFlag is true and the input exceeds
        SPLIT_LIMIT sequences, the FASTA is split and chunks run in
        parallel via Feature.mp_run, then recombined.  Sequences with no
        SignalP row default to '0.000'.  The raw output is deleted unless
        rawFlag is truthy.
        """
        command = SIGNALP_PATH
        rawOutput = os.path.join(outputDir, "SIGNALP")
        if not os.path.exists(rawOutput):
            os.mkdir(rawOutput)

        sequenceIDs = readfasta.readFastaDesc(inputFasta, key="full")

        rawPath = os.path.join(rawOutput, "%s.output" % Path(inputFasta).stem)
        multi = (multiFlag.lower() in ['t', 'true']
                 and len(sequenceIDs) > SPLIT_LIMIT)
        if multi:
            (inFiles, outFiles) = Feature.split_files(inputFasta, rawOutput)
        else:
            inFiles = [inputFasta]
            outFiles = [rawPath]

        # Resolve the organism flag once.  Previously an unrecognized
        # organism left `cline` unbound and crashed with NameError.
        if organism.lower() in ["gram+", "g+"]:
            gramType = "gram+"
        elif organism.lower() in ["gram-", "g-"]:
            gramType = "gram-"
        else:
            raise ValueError(
                "run_signalp requires a gram+/gram- organism, got %r" %
                organism)
        clines = [
            "%s -t %s %s > %s" % (command, gramType, inFile, outFile)
            for inFile, outFile in zip(inFiles, outFiles)
        ]
        Feature.mp_run(clines, process)

        if multi:
            Feature.combine_and_cleanup_files(inFiles, outFiles, rawPath)

        # SignalP truncates IDs at the first space; map the short ID back
        # to the full header for the TSV.
        fasta = readfasta.readFasta(inputFasta, key="full", strip=False)
        sequenceIDs = {}
        for fastaID in fasta.keys():
            shortID = fastaID.strip().strip('>').split(' ')[0]
            sequenceIDs[shortID] = fastaID.strip().strip('>')
        values = {}
        with open(rawPath) as handle:
            for row in handle:
                # Lines from a file iterator keep their '\n', so the
                # original `row == ''` blank-line guard never fired and a
                # blank line crashed on tokens[8]; strip before testing.
                if not row.strip() or row.startswith('#'):
                    continue
                tokens = re.split('[ ]+', row)
                if tokens[0] in values:
                    # Typo fixed: message previously read "SingalP".
                    print("Duplicate SignalP Result: %s" % tokens[0])
                # Column 9 of SignalP's short format is the D-score.
                values[tokens[0]] = tokens[8]

        output = ["ID\tSignalP_DScore"]
        for shortID in sequenceIDs:
            if shortID in values:
                output.append('\t'.join(
                    [sequenceIDs[shortID], values[shortID]]))
            else:
                output.append('\t'.join([sequenceIDs[shortID], '0.000']))
        tsvPath = os.path.join(rawOutput,
                               "%s.signalp.tsv" % Path(inputFasta).stem)
        with open(tsvPath, 'w') as handle:
            handle.write('\n'.join(output))
        if not rawFlag:
            os.remove(rawPath)
Esempio n. 9
0
    def train(self, positiveFasta, negativeFasta, outputDir, organism,
              incFeatures, multiFlag, process):
        """Train the VaxignML XGBoost classifier on labeled feature TSVs.

        Builds a feature matrix from the per-method TSVs under
        outputDir/_FEATURE for the negative (label 0) and positive
        (label 1) FASTA sets, min-max scales it, grid-searches an
        mRMR-selection + XGBoost pipeline, and persists Scaler.sav,
        VaxignML.sav, and the training probabilities (VaxignML.scores)
        to outputDir.
        """
        featureDir = os.path.join(outputDir, "_FEATURE")
        masterLabels = ["ID", "Label", "Gram"]
        masterData = {}

        # Gram stain encoded as a feature column: 1 = gram+, 0 = gram-.
        # The original's two gram branches were verbatim copies apart
        # from this value; unknown organisms silently contributed no
        # data, which is preserved below.
        if organism.lower() in ["gram+", "g+"]:
            gram = "1"
        elif organism.lower() in ["gram-", "g-"]:
            gram = "0"
        else:
            gram = None

        # g doubles as the class label: 0 = negative set, 1 = positive.
        for (g, inputFasta) in enumerate([negativeFasta, positiveFasta]):
            sequenceIDs = readfasta.readFastaDesc(inputFasta, key="full")
            if gram is None:
                continue
            for fastaID in sequenceIDs:
                masterData[fastaID] = [fastaID, str(g), gram]
            for method in incFeatures:
                tsvFile = os.path.join(
                    featureDir, method.upper(),
                    "%s.%s.tsv" % (Path(inputFasta).stem, method))
                # PSORTb's first value column (SubcellularLocation) is
                # categorical and excluded from the numeric matrix.
                start = 2 if method == "psortb" else 1
                for (i, line) in enumerate(open(tsvFile).read().splitlines()):
                    tokens = line.split('\t')
                    if i == 0:
                        masterLabels += tokens[start:]
                    else:
                        masterData[tokens[0]] += tokens[start:]

        # Columns: [ID, Label, Gram, features...] — Gram onward is X.
        rows = []
        samples = []
        groups = []
        for fastaID in masterData.keys():
            tokens = masterData[fastaID]
            samples.append(fastaID)
            groups.append(int(tokens[1]))
            rows.append(tokens[2:])
        X = np.array(rows).astype(float)
        y = np.array(groups)

        # Persist the fitted scaler so prediction can reuse it.
        scaler = MinMaxScaler(copy=False)
        scaler.fit(X)
        joblib.dump(scaler, os.path.join(outputDir, "Scaler.sav"))

        X = scaler.transform(X)
        est = XGBClassifier(objective='binary:logistic',
                            silent=True,
                            nthread=1,
                            eval_metric='auc',
                            random_state=26)

        # mRMR feature selection feeding XGBoost, tuned by grid search
        # over at most ten k values plus the core XGBoost knobs.
        estPipe = Pipeline([('feature_selection', SelectKBest(mRMR)),
                            ('classification', est)])
        grid = [{
            "feature_selection__k": list(range(20, X.shape[1], 20))[:10],
            'classification__learning_rate': [0.3, 0.1],
            'classification__n_estimators': [60, 80, 100, 120, 140, 160],
            'classification__max_depth': [3, 6, 9],
            'classification__min_child_weight': [1, 3],
            'classification__scale_pos_weight': [1, 6],
            'classification__max_delta_step': [0, 3],
        }]

        cv = StratifiedShuffleSplit(n_splits=3, random_state=6)
        # The original built two identical GridSearchCVs differing only
        # in n_jobs; resolve the worker count once instead.
        n_jobs = process if multiFlag.lower() in ['t', 'true'] else 1
        xgb = GridSearchCV(estimator=estPipe,
                           param_grid=grid,
                           cv=cv,
                           iid=False,
                           verbose=1,
                           n_jobs=n_jobs)
        xgb.fit(X, y)
        joblib.dump(xgb, os.path.join(outputDir, "VaxignML.sav"))

        # Training-set probabilities, saved for downstream scoring.
        y_prob = xgb.predict_proba(X)
        joblib.dump(y_prob, os.path.join(outputDir, "VaxignML.scores"))