Example no. 1
0
    def __init__(self):
        self.config = Utils.loadConfig()
        self.task = self.config.get('eval', 'task')
        self.gold = self.config.get('eval', 'goldID.path')
        self.result = self.config.get('eval', 'result.path')
        self.threshold = float(self.config.get('eval', 'threshold'))
        self.sparkContext = SparkContext(conf=Utils.getSparkConf('filter'))

        self.Similarity = Similarity.Similarity(self.config)
        self.Filter = Filter.Filter(self.config,
                                    sparkContext=self.sparkContext)
        self.Merger = Merger.Merger(self.config)

        self.goldIDs = Utils.readFileLines(self.gold)[1:]
        self.resultFiles = Utils.listFilesExt(self.result, 'IDs.test')

        # total number of gold genes
        self.nbGoldGenes = len(self.goldIDs)
        # gold clusters folded into {clusterID: [geneIDs]}
        self.foldedGold = Utils.foldClusterData(self.goldIDs, 'gold', 0)
        self.goldGenes = [
            gene for genes in self.foldedGold.values() for gene in genes
        ]
        # total number of gold clusters
        self.nbGoldClusters = len(self.foldedGold)
        self.outputheader = 'goldClusterID\tgoldGeneID\tpredictedClusterLabel\tpredictedClusterID\n'
        self.scoreheader = 'goldClusterID\tpredictedClusterID\tclusterScore\n'
Example no. 2
0
    def summarize(self):
        metricFiles = Utils.listFilesExt(self.result, 'metrics')
        metricFiles = sorted(metricFiles)
        output, pos = "", ""
        outputFile = Utils.normalizePath(self.result) + "results.summary"
        if ("pos" in self.result):
            pos = self.result.split("pos")[1][0:2]

        for file in metricFiles:
            metrics = Utils.readFileLines(file)[2].replace("pos\t", "")
            filename = os.path.basename(file)
            classifier = filename.split("_")[0]
            feats = filename.split("_")[1] + "+" + filename.split("_")[2]
            length = filename.split("len")[1].split("_")[0]
            overlap = filename.split("overlap")[1].split("_")[0][0:2]

            evaltype = filename.split("IDs.test.")[1].replace(
                "eval.metrics", "").replace(".", "")
            if (not evaltype):
                evaltype = "succ0"
            if ("similar" in evaltype):
                evaltype = evaltype.replace("similar", "sim")
            if ("merge" in evaltype):
                evaltype = evaltype.replace("succ", "")

            line = "\t".join([feats, classifier, pos, length, overlap,
                              evaltype, metrics]) + "\n"
            output += line
        Utils.writeFile(outputFile, output)
Example no. 3
0
    def createSimilarityMatrix(self):
        source_type = self.config.get('dataPipeline', 'source.type')
        fastaFiles = Utils.listFilesExt(self.source_path, "fasta")
        outputFile = self.result_path + '/similarity.blast'
        outputRedFile = self.result_path + '/similarity.blast.similarity'
        similarity = ""
        columns = [
            'qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen',
            'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore', 'qcovs'
        ]

        if (not os.path.isfile(outputFile)):

            # generate all gene pairs within a genome
            allpairs = {(i, j) for i in fastaFiles for j in fastaFiles}
            # filter out duplicate pairs, e.g. (2,8) and (8,2)
            file_content = set(tuple(sorted(p)) for p in allpairs)

            datapipe = DataPipeline.DataPipeline(source_type=source_type,
                                                 source_path=self.source_path,
                                                 result_path=self.result_path)

            sparkContext = SparkContext(
                conf=datapipe.initSpark("blastSimilarity"))
            similarity = datapipe.getBLAST(file_content,
                                           sparkContext,
                                           blastTask="similarity")

            result = ""
            for entry in similarity:
                if (entry[1]):
                    result += entry[1] + "\n"

            Utils.writeFile(outputFile, result)
            df = pandas.read_csv(StringIO(result),
                                 sep='\t',
                                 names=columns,
                                 index_col=False)

        else:
            df = pandas.read_csv(outputFile,
                                 sep='\t',
                                 names=columns,
                                 index_col=False)

        # generate leaner matrix with only selected columns,
        # output to new file
        if (not os.path.isfile(outputRedFile)):
            df = df[['qseqid', 'sseqid', 'pident', 'bitscore', 'qcovs']]
            df['id'] = df[['qseqid', 'sseqid']].agg('|'.join, axis=1)
            # selecting the columns below also drops qseqid/sseqid,
            # which are now encoded in 'id'
            df = df[['id', 'pident', 'bitscore', 'qcovs']]
            df = df.sort_values('id')
            df.to_csv(sep='\t',
                      header=True,
                      path_or_buf=outputRedFile,
                      index=False)

        print('done!')
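
A note on the pair-generation step above: sorting each pair before adding it to a set is what collapses (a, b) and (b, a) into one entry. A minimal standalone sketch of that idiom, using made-up file names:

genes = ['g1.fasta', 'g2.fasta', 'g3.fasta']

# all ordered pairs, including self-pairs: 3 * 3 = 9
allpairs = {(i, j) for i in genes for j in genes}
# sorting each pair collapses (a, b) and (b, a): 3 self-pairs + 3 combinations = 6
unique = set(tuple(sorted(p)) for p in allpairs)

print(len(allpairs), len(unique))  # 9 6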
Example no. 4
0
    def runCrossValid(self, x_occ, y_labels, IDs):
        seed = 5
        np.random.seed(seed)
        i = 1

        kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

        for train, valid in kfold.split(x_occ, y_labels):
            # find features that never occur in the training fold and
            # remove them from both the training and validation folds

            # column indices of features with at least one occurrence in train
            keptFeats = np.where(np.sum(x_occ[train], axis=0) > 0)[0]

            # keep only those columns
            x_occT = np.take(x_occ[train], keptFeats, axis=1)
            x_occV = np.take(x_occ[valid], keptFeats, axis=1)

            self.classifier.fit(x_occT, y_labels[train])

            output, IDoutput = self.getPredictions(IDs[valid], x_occV,
                                                   y_labels[valid])
            Utils.writeFile(self.outFile + 'f' + str(i) + '.valid', output)
            Utils.writeFile(self.outFile + 'f' + str(i) + '.IDs.valid',
                            IDoutput)
            i += 1

        self.classifier = self.setUpClassifier()
        self.classifier.fit(x_occ, y_labels)

        joblib.dump(self.classifier, self.modelFile)
        print('Model saved.')
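
The per-fold feature filtering in runCrossValid is easier to see on a toy occurrence matrix. A minimal numpy sketch of the same np.where / np.take pattern (shapes and values are made up):

import numpy as np

# 3 training instances x 4 features; feature 2 never occurs in training
x_train = np.array([[1, 0, 0, 2],
                    [0, 3, 0, 0],
                    [2, 0, 0, 1]])
x_valid = np.array([[0, 1, 5, 0]])

kept = np.where(np.sum(x_train, axis=0) > 0)[0]  # array([0, 1, 3])

x_train_f = np.take(x_train, kept, axis=1)  # shape (3, 3)
x_valid_f = np.take(x_valid, kept, axis=1)  # feature 2 dropped here as well
print(kept, x_valid_f)                      # [0 1 3] [[0 1 0]]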
Example no. 5
0
def parseFastaToList(path, filter):
    thislist, files, filterIDs, filename_content = [], [], [], []

    if (os.path.isfile(path)):
        files.append(path)
    else:
        files = Utils.listFilesExt(path, 'fasta')

    if (os.path.isfile(filter)):
        filterIDs = Utils.readFileLines(filter)
    else:
        filterIDs = [line for line in filter.split('\n') if line]

    for file in files:
        sequences = parseFasta(file)
        for fasta_record in sequences:
            output = '>' + str(fasta_record.id) + '\n' + str(fasta_record.seq)
            if (len(filterIDs) > 0):
                # skip records whose ID appears in the filter list
                if not any(str(fasta_record.id) in entry
                           for entry in filterIDs):
                    thislist.append(output)
                    filename_content.append(tuple([file, output]))
            else:
                thislist.append(output)
                filename_content.append(tuple([file, output]))

    return thislist, filename_content
Example no. 6
0
    def runGridSearch(self, x_occ, y_labels):
        output = 'Running grid search for ' + self.classif + ' in ' + str(
            len(x_occ)) + ' instances ...\n'
        print('Running grid search for', self.classif, 'in', str(len(x_occ)),
              'instances ...\n')
        scores = ['f1', 'precision', 'recall']

        for score in scores:
            output += 'Grid search for score: ---> ' + score + ' <---\n'

            classif = GridSearchCV(estimator=self.setUpClassifier(),
                                   param_grid=self.getGridParams(),
                                   scoring=score,
                                   cv=5,
                                   n_jobs=60)
            classif.fit(x_occ, y_labels)
            output += 'Best parameters in train set:\n'
            output += str(classif.best_params_) + '\n'
            output += 'Grid scores in train set:\n'
            means = classif.cv_results_['mean_test_score']
            stds = classif.cv_results_['std_test_score']

            for mean, std, params in zip(means, stds,
                                         classif.cv_results_['params']):
                params = str(params).replace('{', '').replace('}', '')
                output += ("%0.3f (+/-%0.03f) |\t params %r" %
                           (mean, std * 2, params)) + '\n'

            output += "\n--------------------------------------------------\n"
            print('Done with', score + '.')

        Utils.writeFile(self.outputPath + self.classif + '.gridSearch', output)
        print(output)
Example no. 7
0
    def extractRewardPerFeat(self, dataPath, outputPath, featType, sourceType,
                             rewardType):

        rewardperfeat = {}
        # list of tuples of the form ((file, content, id), 'kmers')
        resultLabel = Parsers.parseDatasetContents(dataPath, featType,
                                                   sourceType)
        # sorted for a deterministic file-to-column mapping
        fileindex = sorted(set(i[0][0] for i in resultLabel))

        for item in resultLabel:
            filename = item[0][0]
            label = Utils.getLabel(filename)
            content = item[0][1]
            idx = fileindex.index(filename)
            occ = 1 if label == 1 else -1

            if (content not in rewardperfeat):
                rewardperfeat[content] = [0] * len(fileindex)

            if ('label' in rewardType):
                rewardperfeat[content][idx] = occ
            else:
                rewardperfeat[content][idx] += occ

        outputstr = ''
        for k, v in rewardperfeat.items():
            outputstr += k + '\t' + (',').join(map(str, v)) + '\n'
        Utils.writeFile(outputPath, outputstr[:-1])

        return rewardperfeat
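
For reference, extractRewardPerFeat builds one vector per feature with a slot per input file. A small self-contained sketch of the 'occ' reward variant (file names and the positive/negative rule are hypothetical; the real code derives labels via Utils.getLabel):

# hypothetical (filename, feature) observations
observations = [('a.pos', 'PF00001'), ('a.pos', 'PF00002'),
                ('b.neg', 'PF00001')]
fileindex = sorted({f for f, _ in observations})  # ['a.pos', 'b.neg']

rewardperfeat = {}
for filename, feat in observations:
    occ = 1 if 'pos' in filename else -1
    if feat not in rewardperfeat:
        rewardperfeat[feat] = [0] * len(fileindex)
    rewardperfeat[feat][fileindex.index(filename)] += occ

print(rewardperfeat)  # {'PF00001': [1, -1], 'PF00002': [1, 0]}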
Example no. 8
0
def parseDatasetContents(dataPath, featType, sourceType):
    files, result = [], []
    if ('domain' in featType or 'dictionary' in featType):
        domainFiles = Utils.listFilesExt(dataPath, 'domains')
        files += domainFiles
        if (len(domainFiles) < 1):
            print('No domains / dictionary files found in', dataPath)
            exit()

    if ('kmers' in featType or 'prot' in featType):
        fastaFiles = Utils.listFilesExt(dataPath, 'fasta')
        files += fastaFiles
        if (len(fastaFiles) < 1):
            print('No fasta files found in', dataPath)
            exit()

    if ('go' in featType):
        goTermFiles = Utils.listFilesExt(dataPath, 'go')
        files += goTermFiles
        if (len(goTermFiles) < 1):
            print('No GO term files found in', dataPath)
            exit()

    for file in files:
        ext = os.path.splitext(file)[1]
        lines = Utils.readFileLines(file)
        # handle genes with an added version number, e.g. NRRL3_00129.1
        id = lines[0].replace('>', '').replace('a', '').split('.')[0]
        if ('fasta' in ext):
            content = lines[1].upper()
            content = normalizeSequence(content, sourceType)
            if ('kmers' in featType):
                result.append(((file, content, id), 'kmers'))
            if ('prot' in featType):
                result.append(((file, content, id), 'protanalys'))

        elif ('domain' in ext):
            content = lines[1:]
            # Comment out next line to keep domain name:
            content = [line.split('.')[0] for line in content]
            content = "\n".join(content)

            if ('pfam' in featType):
                temp = content.split('\n')
                for entry in temp:
                    result.append(((file, entry, id), 'domains'))
            else:
                if (content):
                    result.append(((file, content, id), 'domains'))

        elif ('go' in ext):
            content = lines[1:]
            content = "\n".join(content)
            result.append(((file, content, id), 'go'))

    return result
Example no. 9
0
 def outputStats(self):
     stat = ['id\trewardKeep\trewardSkip\tqvalueKeep\tqvalueSkip']
     for i, id in enumerate(self.rewardIDs):
         stat.append(id + '\t' + str(self.rewardTable[i][0]) + '\t' +
                     str(self.rewardTable[i][1]) + '\t' +
                     str(self.QTable[i][0]) + '\t' +
                     str(self.QTable[i][1]))
     Utils.writeFile(self.outputPath + self.params + '.stat',
                     '\n'.join(stat))
     print('Stats saved.')
Example no. 10
0
    def train(self, sparkContext, outputStats):
        dataset = self.prepareData(self.trainPath, sparkContext)
        penalties = []
        logging = ['Episode\tPenalty']

        for ep in range(0, self.episodes):
            totalStates = 0
            penalty = 0
            for i, entry in enumerate(dataset):
                # check reward per cluster according to table
                states = entry[1]  # domains only
                actionType = ''
                totalStates += len(states)
                for j, state in enumerate(states):
                    state = state.split('.')[0]
                    stateIdx = self.rewardIDs.index(state)

                    # epsilon-greedy action selection
                    if (random.uniform(0, 1) < self.epsilon):
                        actionType = 'explore'
                        action = random.randrange(len(self.actions))
                    else:
                        action = np.argmax(self.QTable[stateIdx])
                        actionType = 'exploit'

                    reward = self.rewardTable[stateIdx, action]

                    # look ahead to the next state in the cluster
                    # (or stay on the last, already stripped, state)
                    if (j + 1 < len(states)):
                        nextState = states[j + 1].split('.')[0]
                    else:
                        nextState = state
                    nextStateIdx = self.rewardIDs.index(nextState)

                    oldQValue = self.QTable[stateIdx, action]
                    nextMax = np.max(self.QTable[nextStateIdx])

                    newQValue = oldQValue + self.alpha * (
                        reward + self.gamma * nextMax - oldQValue)
                    self.QTable[stateIdx, action] = newQValue

                    # rewards below the threshold count as a penalty
                    if (reward < self.penaltyThreshold):
                        penalty += 1

            penalties.append(penalty)

        np.save(self.QTablePath, self.QTable)
        np.save(self.rewardTablePath, self.rewardTable)
        Utils.writeFile(self.IDmapPath, '\n'.join(self.rewardIDs))

        if outputStats:
            self.outputStats()

        print('Done training!')
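
The table update inside the loop above is the standard tabular Q-learning rule, Q(s, a) += alpha * (reward + gamma * max Q(s', .) - Q(s, a)). A minimal numpy sketch of just that step, with assumed hyperparameters and a toy table:

import numpy as np

alpha, gamma = 0.1, 0.9    # assumed learning rate and discount factor
QTable = np.zeros((3, 2))  # 3 states (domains) x 2 actions (keep, skip)

stateIdx, action, nextStateIdx, reward = 0, 0, 1, 1.0

oldQ = QTable[stateIdx, action]
nextMax = np.max(QTable[nextStateIdx])
QTable[stateIdx, action] = oldQ + alpha * (reward + gamma * nextMax - oldQ)

print(QTable[stateIdx, action])  # 0.1 after a single update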
Example no. 11
0
def genBankToAminoacid(path):
    entries = []
    # only aminoacid sequence
    translations = ''
    files = []
    if (os.path.isfile(path)):
        files.append(path)
    else:
        files = Utils.listFilesExt(path, 'gbk')

    for file in files:
        species = Utils.getSpecies(file)
        records = parseGenBank(file)

        for record in records:
            locus = record.id
            for feature in record.features:
                #if feature.key == "CDS":
                if feature.type == "CDS":
                    id, locus_tag, gene, protein_id = '', '', '', ''
                    translation, product, function, description = '', '', '', ''

                    for key, value in feature.qualifiers.items():
                        # collect the qualifiers of interest
                        if key == "translation":
                            translation = value[0]
                        elif key == "gene":
                            gene = value[0]
                        elif key == "locus_tag":
                            locus_tag = value[0]
                        elif key == "protein_id":
                            protein_id = value[0]
                            protein_id = protein_id.replace('/', '')
                        elif key == "product":
                            product = value[0]
                        elif key == "function":
                            function = value[0]

                    #priority for gene ID
                    id = locus_tag if not id and len(locus_tag) > 1 else id
                    id = gene if not id and len(gene) > 1 else id
                    id = protein_id if not id and len(protein_id) > 1 else id

                    description = product if product.strip() else description
                    if function.strip():
                        description += '|' + function

                    entry = '>' + locus + '|' + species + '|' + id + '|' + description + '\n' + translation
                    if (entry not in entries):
                        entries.append(entry)
                        translations += translation

    return entries, translations
Example no. 12
0
    def getRFImportance(self):
        pd.options.display.float_format = '{:,.8f}'.format
        importance = self.classifier.feature_importances_
        features = self.extractor.loadFeatures()

        feature_importances = pd.DataFrame(importance,
                                           index=features,
                                           columns=['importance']).sort_values(
                                               'importance', ascending=False)

        Utils.writeFile(self.extractor.featFile + 'importance',
                        feature_importances.to_string())
Example no. 13
0
 def __init__(self):
     self.config = Utils.loadConfig()
     self.path = self.config.get('prediction', 'source.path')
     self.path = Utils.normalizePath(self.path)
     self.trainPath = self.path + 'train/'
     self.testPath = self.path + 'test/'
     self.outputPath = self.path + 'metricsQLearner/models/'
     self.geneMapPath = self.config.get('eval', 'filter.map')
     self.geneMap = {}
     self.extractor = Extractor.Extractor(self.config, self.outputPath)
     self.rewardType = 'occ'
     self.rewardPath = self.outputPath + self.rewardType + 'PerDomains.feat'  # pfam domain list
     self.rewardList, self.rewardIDs, self.rewardLabels = '', '', ''
     self.actions = ['keep', 'skip']
     self.task = 'train'
     self.rewardTable, self.QTable = [], []
     self.episodes = int(self.config.get('prediction', 'episodes'))
     # hyperparams
     self.alpha = float(self.config.get('prediction',
                                        'alpha'))  # learning rate
     self.gamma = float(self.config.get('prediction',
                                        'gamma'))  # discount factor
     self.epsilon = float(self.config.get('prediction',
                                          'epsilon'))  # exploration
     self.penaltyThreshold = float(
         self.config.get(
             'prediction',
             'penalty.threshold'))  # negative rewards mean penalty
     self.keepskipThreshold = float(
         self.config.get('prediction', 'keepskip.threshold')
     )  # keep reward ratio wrt skip reward for domain to be kept
     self.useSimilarityWeight = False
     self.useCompWeights = False
     self.useNeighborWeight = self.config.getboolean(
         'prediction', 'neighbor.weight')
     self.useDryIslands = self.config.getboolean('prediction',
                                                 'dry.islands')
     self.useAvAction = self.config.getboolean('prediction',
                                               'average.action')
     self.weightsPath = self.config.get('eval', 'weights')
     self.weights = Utils.readFileLines(
         self.weightsPath
     ) if self.useCompWeights or self.useNeighborWeight else ''
     self.params = self.rewardType + '_keepgt' + str(
         self.keepskipThreshold) + 'skip' + '_ep' + str(
             self.episodes) + '_alpha' + str(self.alpha) + '_gamma' + str(
                 self.gamma) + '_eps' + str(self.epsilon)
     self.params += '_neighbor' if self.useCompWeights else ''
     self.QTablePath = self.outputPath + 'Qtable_' + self.params + '.npy'
     self.rewardTablePath = self.outputPath + 'Rewards_' + self.params + '.npy'
     self.IDmapPath = self.outputPath + 'RewardIDsmap_' + self.params + '.map'
Example no. 14
0
    def __init__(self, blastTask):
        self.config = Utils.loadConfig()
        self.sourceType = self.config.get('dataPipeline', 'source.type')
        self.blastTask = blastTask
        self.blastdb = self.config.get('blaster', 'blastdb.path')
        self.blastdb = Utils.normalizePath(self.blastdb)
        self.blastdbName = self.config.get('blaster', 'blastdb.name')
        if (not self.blastdbName.endswith('fasta')):
            self.blastdbName += '.fasta'

        self.goTerms = 'goterm' in blastTask.lower()
        self.mappingFile = self.blastdb + self.blastdbName.replace('.fasta', '.tab')
        self.mapping = ''
        if (self.goTerms):
            self.mapping = self.loadMapping()
Example no. 15
0
    def getDomains(self, sparkContext):

        # recover the species name for use in temp files
        self.species = Utils.getSpecies(self.source_path)
        domainFinder = DomainFinder.DomainFinder()

        # load source sequences into a single list
        if ("fasta" in self.source_type):
            sequences, file_content = Parsers.parseFastaToList(
                self.source_path, "")
        elif ("genbank" in self.source_type):
            sequences, translations = Parsers.genBankToAminoacid(self.source_path)

        print('Processing domains...')

        # create RDD with source sequences
        sourceRDD = sparkContext.parallelize(file_content, numSlices=2000)

        if ("nucleotide" in self.source_type):
            # execute sixFrame translation for each sequence in RDD
            sourceRDD = sourceRDD.map(lambda x: SixFrameTranslator.main(x))

        # execute Pfam domain prediction for each sixFrame translation in RDD
        domainsRDD = sourceRDD.map(lambda x: domainFinder.main(x[0], x[1]))
        processedRDD = domainsRDD.map(
            lambda x: self.processDomainOutput(x[0], x[1]))

        # recover Pfam domain prediction results from RDD
        result = processedRDD.collectAsMap()

        print('Done!')

        return result
Example no. 16
0
    def computeMajorityVote(self, predictedClusters, thresholdPerc,
                            thresholdLen):
        predictPerGene, majority, geneLengths = {}, {}, {}
        majorityClusters = []
        threshold = thresholdPerc / 100
        lengths = Utils.readFileLines(self.geneLengthPath)

        for i in lengths:
            temp = i.split('\t')
            geneLengths[temp[0].split('.')[0]] = int(temp[1])

        # get number of times gene appears, and with which label
        for cluster in predictedClusters:
            clustergenes = cluster.replace('||', '|').replace('|\t',
                                                              '\t').split('\t')
            label = 1 if float(clustergenes[1]) >= self.threshold else 0
            genes = clustergenes[0].split('|')
            for gene in genes:
                gene = gene.split('.')[0]
                if (gene in predictPerGene):
                    predictPerGene[gene].append(label)
                else:
                    if (gene):
                        predictPerGene[gene] = [label]

        # per-gene score: the fraction of appearances labeled positive
        for gene, labels in predictPerGene.items():
            majority[gene] = sum(labels) / len(labels)

        # concatenate genes scoring above threshold
        tempcluster, tempscore, templength = '', 0, 0
        for gene, score in majority.items():
            if (score >= threshold):
                if (tempscore >= threshold):
                    tempcluster += gene + '|'
                    templength += geneLengths.get(gene)
                else:
                    majorityClusters.append(tempcluster[:-1] + '\t' + '0')
                    tempcluster = gene + '|'
                    templength = geneLengths.get(gene)
            else:
                if (tempscore < threshold):
                    # concatenate until around threshold length for negatives
                    if (templength < thresholdLen):
                        tempcluster += gene + '|'
                        templength += geneLengths.get(gene)
                    else:
                        majorityClusters.append(tempcluster[:-1] + '\t' + '0')
                        tempcluster = gene + '|'
                        templength = geneLengths.get(gene)
                else:
                    majorityClusters.append(tempcluster[:-1] + '\t' + '1')
                    tempcluster = gene + '|'
                    templength = geneLengths.get(gene)

            tempscore = score

        print('Done computing majority vote.')
        return majorityClusters
Example no. 17
0
    def createNegShuffle(self, posPerc):
        files = Utils.listFilesExt(self.source_path, self.ext)
        negPerc = 100 - posPerc
        positives = len(files)
        negativeSize = int((negPerc * positives) / posPerc)
        print('Negative percentage: ' + str(negPerc) + '% \n' +
              'Negative instances: ' + str(negativeSize) + '\n' +
              'Positive percentage: ' + str(posPerc) + '% \n' +
              'Positive instances: ' + str(positives) + '\n' +
              'Total corpus size: ' + str(negativeSize + positives))

        thisDecRatio = 0.0
        count = 0
        ratio = (negativeSize / positives)
        decRatio = ratio - int(ratio)

        print('Generating...')
        for file in files:
            # add up the decimal ratio part
            thisDecRatio += round(decRatio, 2)
            # reset range
            ratioRange = int(negativeSize / positives)

            # check if decimal ratio added up to a duplicate
            if (thisDecRatio >= 1):
                ratioRange = int(ratio + thisDecRatio)
                thisDecRatio = 0

            for i in range(0, ratioRange):
                name = os.path.basename(file)
                result_file = name.split('.')[0] + '_' + str(
                    i) + '.shuffled.negative.fasta'

                if ('nuc' in self.seqType):
                    content = Parsers.genBankToNucleotide(file)
                if ('amino' in self.seqType):
                    entries, content = Parsers.genBankToAminoacid(file)
                content = Utils.charGramShuffle(content, 2)
                content = '>' + name + '\n' + content

                count += 1

                Utils.writeFile(self.result_path + result_file, content)

        print('Total generated: ' + str(count) + '. Done!')
Example no. 18
0
    def addInstance(self, info):
        # receives tuple in format:
        # (fileID, [list of extracted features])
        # for pfam only (inverted):
        # (pfamID, [list of fileIDs])
        instanceValues = list()

        label = Utils.getLabel(''.join(info[1])) if 'pfam' in self.featType else Utils.getLabel(info[0])
        fileID = info[0]
        features = [info[0]] if 'pfam' in self.featType else info[1]

        for feat in features:
            if feat in self.dictionary:
                instanceValues.append(self.dictionary.get(feat))

        size = len(instanceValues)
        instanceID = (fileID, int(size), str(label))

        return instanceID, instanceValues
Example no. 19
0
    def __init__(self):
        # read application configuration props
        self.config = Utils.loadConfig()
        self.path = self.config.get('prediction', 'source.path')
        self.path = Utils.normalizePath(self.path)
        self.trainPath = self.path + 'train/'
        self.validPath = self.path + 'validation/'
        self.gridCVPath = self.path + 'train_validation/'
        self.testPath = self.path + 'test/'
        self.outputPath = self.path + 'metrics/cv_gridsearchparams/'
        self.task = self.config.get('prediction', 'task')
        self.posPerc = int(self.config.get('prediction', 'pos.perc'))
        self.classif = self.config.get('prediction', 'classifier')
        os.makedirs(os.path.dirname(self.outputPath), exist_ok=True)
        self.extractor = Extractor.Extractor(self.config, self.outputPath)
        self.loader = Loader.Loader(self.config, self.outputPath)
        self.dimHandler = DimensionHandler.DimensionHandler(
            self.config, self.outputPath)
        self.outFile = ''
        self.useEmbeddings = self.config.getboolean('prediction',
                                                    'use.embeddings')
        self.cv = self.config.getboolean('prediction', 'use.crossvalid')
        if ('cross' in self.task):
            self.cv = True
        if ('none' not in self.dimHandler.name.lower()):
            self.outFile = self.dimHandler.getOutFile(self.classif)
            self.outFile = self.outFile + '_embeddings' if self.useEmbeddings else self.outFile
        else:
            self.outFile = self.outputPath + self.classif + '_' + self.extractor.featType
            if ('kmers' in self.extractor.featType):
                kmerfeats = 'kmers' + str(
                    self.extractor.size) + '_minOcc' + str(
                        self.extractor.minOcc)
                self.outFile = self.outFile.replace('kmers', kmerfeats)
        if ('cross' in self.task or 'grid' in self.task or self.cv):
            if ('grid' in self.task):
                self.extractor.featFile = self.extractor.featFile.replace(
                    '.feat', '.complete.feat')
            if ('cross' in self.task or self.cv):
                self.outFile += '_cv05'

        self.modelFile = self.outFile + '.model.pkl'
        self.classifier = self.setUpClassifier()
Example no. 20
0
 def getEmbeddings(self):
     matrix = np.zeros((self.dictLength(), self.embedSize))
     embfiles = Utils.listFilesExt(self.embedPath, 'w2v')
     for i in embfiles:
         if ('kmer' in i.lower() and 'kmer' in self.featType.lower()):
             matrix = self.mapEmbedWeights(i, 'kmer', matrix)
         elif ('domain' in i.lower() and 'domain' in self.featType.lower()):
             matrix = self.mapEmbedWeights(i, 'domain', matrix)
         elif ('go' in i.lower() and 'go' in self.featType.lower()):
             matrix = self.mapEmbedWeights(i, 'go', matrix)
     return matrix
Example no. 21
0
    def createDomainDataset(self):
        useID = True
        files = Utils.listFilesExt(self.source_path, self.ext)
        files = [
            fileName for fileName in files if not os.path.isfile(
                self.result_path +
                os.path.basename(fileName).replace('.fasta', '.domains'))
        ]

        source_type = self.config.get('dataPipeline', 'source.type')
        count = 0
        countNone = 0
        datapipe = DataPipeline.DataPipeline(source_type=source_type,
                                             source_path=self.source_path,
                                             result_path=self.result_path)

        sparkContext = SparkContext(conf=datapipe.initSpark("domainDataset"))
        pfamDomains = datapipe.getDomains(sparkContext)

        for file in files:
            fileName = os.path.basename(file)
            with open(file, 'r') as f:
                IDs = f.readline()

            resultFile = self.result_path + fileName.replace(
                '.fasta', '.domains')
            result = pfamDomains.get(file)

            if (result is not None):
                result = result.split('\n')
                with open(resultFile, 'w') as outF:
                    outF.write(IDs)

                    for line in result:
                        if (len(line.strip()) > 1):
                            items = line.split('\t')
                            domainID = items[5]
                            domain = items[6]
                            bitscore = items[11]
                            if (useID):
                                domain = domainID + '|' + domain

                            outF.write(domain + '\n')

                count += 1
            else:
                print('None for file: ', file)
                countNone += 1

        print('Done generating', str(count),
              'domain files. \nNo domain found for', str(countNone), 'files.')
Example no. 22
0
    def loadFilterMap(self, sparkContext):
        filterList = Utils.readFileLines(self.filterList)
        # returns tuples of the form ((file, content, id), 'domains')
        content = Parsers.parseDatasetContents(self.filterMap, 'domains',
                                               'domains')

        domRDD = sparkContext.parallelize(content, numSlices=1000)
        domainsRDD = domRDD.map(lambda x: (Utils.getFileName(x[0][0]).replace(
            '.domains', ''), x[0][1]))

        # keep genes that have any domain in filterList,
        # discarding the ".\d+" suffix of the Pfam ID
        filtered = domainsRDD.filter(lambda x: any(
            domain in filterList for domain in re.split("[\n.]", x[1])))

        result = filtered.collectAsMap().keys()
        genes = sorted(result)

        print('Loaded filter:', len(genes), 'genes will be filtered from',
              len(filterList), 'domains.')
        return genes
Example no. 23
0
 def __init__(self, source_type=None, source_path=None, result_path=None):
     self.config = Utils.loadConfig()
     self.task = self.config.get('dataPipeline', 'task')
     self.source_path = self.config.get(
         'dataPipeline',
         'source.path') if source_path is None else source_path
     self.source_type = self.config.get(
         'dataPipeline',
         'source.type') if source_type is None else source_type
     self.result_path = self.config.get(
         'dataPipeline',
         'result.path') if result_path is None else result_path
     self.result_path = Utils.normalizePath(self.result_path)
     # create the result dir if it doesn't exist
     os.makedirs(os.path.dirname(self.result_path), exist_ok=True)
     # recover the species name for use in temp files
     self.species = Utils.getSpecies(self.source_path)
     # temp dir + file used by sub-pipelines
     self.path = os.path.dirname(os.path.realpath(__file__))
     self.path += '/temp/'
     os.makedirs(os.path.dirname(self.path), exist_ok=True)
Example no. 24
0
def genBankToFasta():
    config = Utils.loadConfig()
    source = config.get('parsers', 'source.path')

    if not str(source).endswith('/'):
        output = source + '_fasta/'
        source += '/'
    else:
        # strings are immutable: build the output name without the trailing slash
        output = source[:-1] + '_fasta/'

    os.makedirs(os.path.dirname(output), exist_ok=True)

    entries, translations = genBankToAminoacid(source)
    content = ''

    for item in entries:
        content += item + '\n'

    Utils.writeFile(output + 'fungi_complete' + '.fasta', content)
Example no. 25
0
    def loadW2V(self, embfile):
        ext = Utils.getFileExt(embfile)
        word2vec = []

        if ('compact' in embfile):
            word2vec = Word2Vec.load(embfile, mmap='r')
        elif ('.bin' in embfile):
            word2vec = KeyedVectors.load_word2vec_format(embfile, binary=True, limit=2000000)
        else:
            word2vec = KeyedVectors.load_word2vec_format(embfile, binary=False, limit=2000000)

        return word2vec
Example no. 26
0
    def createPfamTsv(self):
        listFiles = Utils.listFilesExt(self.source_path, 'domains')

        head = 'sequence_id\tprotein_id\tgene_start\tgene_end\tgene_strand\tpfam_id\tin_cluster\n'
        contentPos = ''
        contentNeg = ''

        for file in listFiles:
            fileContent = Utils.readFileLines(file)
            id = fileContent[0].replace('>', '')
            fileContent = fileContent[1:]
            inCluster = 1 if 'bgc' in os.path.basename(file).lower() else 0

            for line in fileContent:
                pfamId = line.split('|')[0]
                product = line.split('|')[1]
                currentLine = id + '\t' + product + '\t0\t0\t0\t' + pfamId + '\t' + str(
                    inCluster) + '\n'
                if (inCluster == 1):
                    contentPos += currentLine
                else:
                    contentNeg += currentLine

        contentPos = head + contentPos[:-1]
        contentNeg = head + contentNeg[:-1]

        folder = os.path.basename(os.path.dirname(file))
        resultPos = self.result_path + folder + '.positives.pfam.tsv'
        resultNeg = self.result_path + folder + '.negatives.pfam.tsv'

        Utils.writeFile(resultPos, contentPos)
        Utils.writeFile(resultNeg, contentNeg)
Example no. 27
0
    def countOccurrence(self, dataPath, sparkContext):
        feats = self.loadFeatures()
        contentIds = []

        listContents = Parsers.parseDatasetContents(dataPath, self.featType,
                                                    self.sourceType)
        parentDir = os.path.split(os.path.dirname(listContents[0][0][0]))[1]

        for info in listContents:
            filename = info[0][0]
            content = info[0][1]
            featType = info[1]
            firstLine = Utils.readFileLines(filename)[0]
            if ('|' in firstLine):
                id = firstLine.replace('>', '')
            else:
                id = firstLine.split('.')[0].replace('>', '')
            label = Utils.getLabel(filename)

            # avoid cases in which test synthetic genes are long and
            # in the split different clusters share same (gene) id
            for item in contentIds:
                if (id in item[0] and featType in item[1]):
                    id = id + '|'

            contentIds.append(tuple([id, featType, content, label]))

        sourceRDD = sparkContext.parallelize(contentIds, numSlices=1000)
        occRDD = sourceRDD.map(lambda x: self.occurrence(x, feats))

        # combine features with same ID and filter out instances with not enough features
        reducedRDD = occRDD.reduceByKey(
            lambda x, y: self.mergeFeatsSameId(x, y))

        ids = reducedRDD.map(lambda x: x[0]).collect()
        occ = reducedRDD.map(lambda x: x[1][0]).collect()
        labels = reducedRDD.map(lambda x: x[1][1]).collect()

        print('Features loaded.')
        return np.array(ids), np.array(occ), np.array(labels), parentDir
Example no. 28
0
    def createGoDataset(self):
        source_type = self.config.get('dataPipeline', 'source.type')
        blastPath = self.config.get('blaster', 'blastdb.path')
        blastPath = Utils.normalizePath(blastPath)
        blastName = self.config.get('blaster', 'blastdb.name')
        blastMapping = blastPath + blastName + '.tab'

        datapipe = DataPipeline.DataPipeline(source_type=source_type,
                                             source_path=self.source_path,
                                             result_path=self.result_path)
        sequences, file_content = Parsers.parseFastaToList(
            self.source_path, "")
        file_content = [
            content for content in file_content if not os.path.isfile(
                self.result_path +
                os.path.basename(content[0]).replace('.fasta', '.go'))
        ]

        sparkContext = SparkContext(conf=datapipe.initSpark("goDataset"))
        goterms = datapipe.getBLAST(file_content,
                                    sparkContext,
                                    blastTask="goTerms")

        count = 0
        notFound = 0
        for file, content in goterms.items():

            lines = content.split('\n')
            if (len(lines) == 2 and not lines[1]):
                notFound += 1
            else:
                filename = os.path.basename(file)
                resultFile = self.result_path + filename
                resultFile = resultFile.replace('.fasta', '.go')
                Utils.writeFile(resultFile, content)
                count += 1

        print('Done generating',
              str(count), 'GO term files. \nNo GO terms found for',
              str(notFound), 'files.')
Example no. 29
0
    def __init__(self, config, outputPath):
        self.sourcePath = config.get('prediction', 'source.path')
        self.sourcePath = Utils.normalizePath(self.sourcePath)
        self.trainPath = self.sourcePath + 'train/'
        self.outputPath = self.sourcePath + 'metricsDL/'
        self.sourceType = config.get('prediction', 'source.type')
        self.useEmbeddings = config.getboolean('prediction', 'use.embeddings')
        self.embedPath = config.get('prediction', 'embed.path')
        self.embedPath = Utils.normalizePath(self.embedPath)

        if (self.useEmbeddings):
            self.featType = config.get('prediction', 'feat.type')
            self.featSize = config.get('prediction', 'feat.size')
            self.minOcc = config.get('prediction', 'feat.minOcc')
            self.embedSize = config.getint('prediction', 'embeddings.length')
            self.embeddingsName = (self.featType + self.featSize + 'minOcc' +
                                   str(self.minOcc) + str(self.embedSize) + 'd')

        self.dictionary = dict()
        self.extractor = Extractor.Extractor(config, outputPath)
        self.featType = config.get('prediction', 'feat.type')
        self.maxLength = 0
Example no. 30
0
 def createActors(self, actors_num):
     actors = []
     r = random.randint(0, len(Const.ActorNameListA) - actors_num)
     # ranges must be converted to lists before shuffling (Python 3)
     name_nums_list = list(range(r, actors_num + r))
     role_nums_list = list(range(actors_num))
     random.shuffle(name_nums_list)
     random.shuffle(role_nums_list)
     for i in range(actors_num):
         role_id = Const.RolesList[role_nums_list[i]]
         name = UtilMethods.makeDefaultName(name_nums_list[i])
         village = self
         actors.append(Actor(i, name, role_id, village))
     return actors
Example no. 31
0
def orthogroupSeqs(orthofile, seqpath, limit):
    orthodir = os.path.dirname(seqpath)
    ortholines = Utils.readFileLines(orthofile)[1:]
    seqpath = Utils.listFilesExt(seqpath, "fasta")
    threshold = limit if (limit) else len(seqpath)
    orthogroups = [re.split('\t|;|,', item)[1:]
                   for item in ortholines][1:threshold + 1]
    sequences, output = dict(), dict()

    print('Loading files and seqIDs...')
    for seqfile in seqpath:
        sequences.update(SeqIO.index(seqfile, "fasta"))

    orthodir = orthodir + '/orthologs_threshold' + str(threshold) + '/'

    if os.path.isdir(orthodir):
        print('Orthogroup path', orthodir, 'already exists.')
        exit()
    else:
        os.makedirs(orthodir)

        print('Loading sequences per IDs...')
        for group in orthogroups:
            for id in group:
                id = id.strip(' ')
                tempseq = sequences.get(id)

                if (tempseq is not None and len(tempseq) > 1):
                    thisseqfile = orthodir + tempseq.id + '.fasta'
                    content = '>' + tempseq.id + '\n' + str(tempseq.seq)
                    Utils.writeFile(thisseqfile, content)
                # else:
                #     print('ID not found', str(id))

    print('Done writing seqs for orthogroups.')
    return output