def __init__(self):
    """Load the evaluation settings and prepare the gold-standard data.

    Reads ``task``, ``goldID.path``, ``result.path`` and ``threshold`` from
    the ``eval`` section of the configuration returned by
    ``Utils.loadConfig()``, creates the Spark context and the Similarity,
    Filter and Merger collaborators, then loads the gold IDs and the list
    of result files.
    """
    cfg = Utils.loadConfig()
    self.config = cfg

    # Settings from the 'eval' section of the configuration.
    self.task = cfg.get('eval', 'task')
    self.gold = cfg.get('eval', 'goldID.path')
    self.result = cfg.get('eval', 'result.path')
    self.threshold = float(cfg.get('eval', 'threshold'))

    # Collaborators; the Spark context is shared with the Filter.
    spark_conf = Utils.getSparkConf('filter')
    self.sparkContext = SparkContext(conf=spark_conf)
    self.Similarity = Similarity.Similarity(cfg)
    self.Filter = Filter.Filter(cfg, sparkContext=self.sparkContext)
    self.Merger = Merger.Merger(cfg)

    # The first entry of the gold file is dropped
    # (presumably a header line -- TODO confirm).
    gold_lines = Utils.readFileLines(self.gold)
    self.goldIDs = gold_lines[1:]
    self.resultFiles = Utils.listFilesExt(self.result, 'IDs.test')

    # Total number of gold genes.
    self.nbGoldGenes = len(self.goldIDs)

    # Gold data folded per cluster
    # (presumably cluster ID -> gene IDs; verify in Utils.foldClusterData).
    folded = Utils.foldClusterData(self.goldIDs, 'gold', 0)
    self.foldedGold = folded

    # Flat list of every gene across all gold clusters.
    flattened = []
    for members in folded.values():
        flattened.extend(members)
    self.goldGenes = flattened

    # Total number of gold clusters.
    self.nbGoldClusters = len(folded)

    # Header lines for the tab-separated output and score files.
    self.outputheader = 'goldClusterID\tgoldGeneID\tpredictedClusterLabel\tpredictedClusterID\n'
    self.scoreheader = 'goldClusterID\tpredictedClusterID\tclusterScore\n'
def clustersToGFF(clusterspath, gffpath, goldpath, annotpath, source_type):
    """Export a cluster listing as per-cluster and per-gene GFF3 files.

    The clusters read from ``clusterspath`` are matched against the mRNA
    features of the GFF3 file at ``gffpath``. Two files are written next to
    ``clusterspath`` (its last extension replaced):

    * ``<base>.percluster.gff3`` -- one record per cluster, spanning from
      the start of its first located member to the end of its last one.
    * ``<base>.pergene.gff3`` -- one record per located cluster member,
      optionally tagged with a ``color`` attribute.

    Args:
        clusterspath: Path of the cluster file. When the path contains
            ``'score'`` the content is folded with
            ``Utils.foldClusterData(content, "", 0.5)``, otherwise with
            ``Utils.foldClusterData(content, "gold", "")``.
        gffpath: Path of the GFF3 file describing the genes.
        goldpath: Path of the gold file, or a falsy value. Colours are only
            assigned when this file yields non-empty content.
        annotpath: Path of a tab-separated annotation file, or a falsy
            value. The second column of the first line mentioning a gene is
            used to choose that gene's colour.
        source_type: Text written right after the sequence ID on every
            output record.

    Returns:
        The ``Gff3`` object parsed from ``gffpath``.
    """
    # Annotation keywords and the colour each one maps to. Order matters:
    # the first keyword found in the annotation text wins.
    annotation_colors = (
        ('backbone', '#EE0000'),    # red
        ('tailor', '#EE9300'),      # orange
        ('transcript', '#048014'),  # forest green
        ('transport', '#1888f0'),   # light blue
    )
    gold_color = '#9931f2'  # bright purple

    gffcontent = Gff3(gffpath)

    clustercontent = Utils.readFileLines(clusterspath)
    if 'score' in clusterspath:
        clusters = Utils.foldClusterData(clustercontent, "", 0.5)
    else:
        clusters = Utils.foldClusterData(clustercontent, "gold", "")

    goldContent = '\t'.join(Utils.readFileLines(goldpath)) if goldpath else ""
    annotationList = Utils.readFileLines(annotpath) if annotpath else []
    annotationContent = '\n'.join(annotationList)

    basepath = clusterspath.rsplit('.', 1)[0]
    gffclusterfile = basepath + '.percluster.gff3'
    gffgenefile = basepath + '.pergene.gff3'

    def normalize(name):
        # Cluster members and GFF names are compared with '.1' removed.
        return name.replace('.1', '')

    def pick_color(gene):
        # Return the colour for a gene, or None when it gets no colour.
        if not goldContent:
            return None
        if gene in annotationContent:
            matches = [item for item in annotationList if gene in item]
            fields = matches[0].split('\t') if matches else []
            # An annotation line without a second column gives no colour
            # instead of raising IndexError.
            annot = fields[1] if len(fields) > 1 else ''
            for keyword, color in annotation_colors:
                if keyword in annot:
                    return color
            return None
        if gene in goldContent:
            # Gene listed in the gold file but absent from the annotations.
            return gold_color
        return None

    def format_record(first, last, attributes):
        # Coordinates start at `first` and end at `last`; the remaining
        # columns come from `first`. str() guards against the parser
        # handing back a non-string strand or phase.
        columns = (
            first['seqid'],
            source_type,
            str(first['start']),
            str(last['end']),
            '?',
            str(first['strand']),
            str(first['phase']),
            attributes,
        )
        return '\t'.join(columns) + '\n'

    # Keep only "mRNA" features: {normalized gene name: gff line}.
    mRNAdict = {
        normalize(line['attributes']['Name']): line
        for line in gffcontent.lines
        if line['type'] == 'mRNA'
    }

    cluster_records = ["##gff-version 3\n"]
    gene_records = ["##gff-version 3\n"]

    # Clusters are emitted in key order.
    for key in sorted(clusters):
        value = clusters[key]
        located = []
        for member in value:
            gene = normalize(member)
            thisgene = mRNAdict.get(gene)
            if thisgene is None:
                print('gene not found:', gene)
                continue
            located.append(thisgene)
            attributes = 'Name=' + gene + ';Note=' + key
            color = pick_color(gene)
            if color is not None:
                attributes += ';color=' + color
            gene_records.append(format_record(thisgene, thisgene, attributes))

        if not located:
            # No member could be placed on the genome, so the cluster has
            # no coordinates to report.
            print('cluster skipped, no gene found:', key)
            continue

        # Span the first and last located members, so that a member missing
        # from the GFF no longer aborts the whole export.
        attributes = 'Name=' + key + ';Note=' + '|'.join(value)
        cluster_records.append(
            format_record(located[0], located[-1], attributes))

    Utils.writeFile(gffclusterfile, ''.join(cluster_records))
    Utils.writeFile(gffgenefile, ''.join(gene_records))
    return gffcontent