def getDomains(self, sparkContext):
    # recover the species name for use in temp files
    self.species = Utils.getSpecies(self.source_path)
    domainFinder = DomainFinder.DomainFinder()

    # load source sequences into a single list
    if ('fasta' in self.source_type):
        _, file_content = Parsers.parseFastaToList(self.source_path, "")
    elif ('genbank' in self.source_type):
        # bind the parsed content to file_content so the RDD below
        # works for both source types
        file_content = Parsers.genBankToAminoacid(self.source_path)

    print('Processing domains...')
    # create RDD with source sequences
    sourceRDD = sparkContext.parallelize(file_content, numSlices=2000)

    if ('nucleotide' in self.source_type):
        # execute six-frame translation for each sequence in RDD
        sourceRDD = sourceRDD.map(lambda x: SixFrameTranslator.main(x))

    # execute Pfam domain prediction for each six-frame translation in RDD
    domainsRDD = sourceRDD.map(lambda x: domainFinder.main(x[0], x[1]))
    processedRDD = domainsRDD.map(lambda x: self.processDomainOutput(x[0], x[1]))

    # recover Pfam domain prediction results from RDD
    result = processedRDD.collectAsMap()
    print('Done!')
    return result
def extractRewardPerFeat(self, dataPath, outputPath, featType, sourceType, rewardType):
    rewardperfeat = {}
    # tuples of shape ((file, content, id), 'kmers')
    resultLabel = Parsers.parseDatasetContents(dataPath, featType, sourceType)
    fileindex = list(set([i[0][0] for i in resultLabel]))

    for item in resultLabel:
        filename = item[0][0]
        label = Utils.getLabel(filename)
        content = item[0][1]
        idx = fileindex.index(filename)
        occ = 1 if label == 1 else -1

        # one reward slot per file, initialized on first sight of this feature
        if (content not in rewardperfeat):
            rewardperfeat[content] = [0] * len(fileindex)
        if ('label' in rewardType):
            rewardperfeat[content][idx] = occ
        else:
            rewardperfeat[content][idx] += occ

    outputstr = ''
    for k, v in rewardperfeat.items():
        outputstr += k + '\t' + ','.join(map(str, v)) + '\n'
    Utils.writeFile(outputPath, outputstr[:-1])

    return rewardperfeat
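
# A minimal sketch of the reward bookkeeping above, on fabricated data, to make
# the two rewardType modes easy to compare; Parsers/Utils are not needed here,
# and the filenames, features, and labels are hypothetical.
def _toy_reward(items, reward_type):
    # items: list of (filename, feature, label) triples
    fileindex = sorted({f for f, _, _ in items})
    reward = {}
    for fname, feat, label in items:
        idx = fileindex.index(fname)
        occ = 1 if label == 1 else -1
        row = reward.setdefault(feat, [0] * len(fileindex))
        if 'label' in reward_type:
            row[idx] = occ      # keep the label only
        else:
            row[idx] += occ     # accumulate signed occurrences
    return reward

items = [('a.fasta', 'ATG', 1), ('a.fasta', 'ATG', 1), ('b.fasta', 'ATG', 0)]
print(_toy_reward(items, 'label'))       # {'ATG': [1, -1]}
print(_toy_reward(items, 'occurrence'))  # {'ATG': [2, -1]}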
def protAnalysis(self, content):
    result, resultFlexDic = dict(), dict()
    content = Parsers.normalizeSequence(content, self.sourceType)
    protein = ProteinAnalysis(content)

    result['proteinMWeight'] = protein.molecular_weight()
    result['proteinAroma'] = protein.aromaticity()
    result['proteinInstab'] = protein.instability_index()
    result['proteinIsoelec'] = protein.isoelectric_point()
    result['proteinGravy'] = protein.gravy()

    proteinStructure = protein.secondary_structure_fraction()
    protStruct = self.flatten('proteinSecstruc', proteinStructure)
    result = {**protStruct, **result}  # merge result and protein structure

    flexibility = protein.flexibility()
    flexibFlat = self.flatten('proteinFlex', flexibility)
    flexibAmino = self.flatten(list(content), flexibility)
    flattened = {**flexibFlat, **result}
    flattenedFlexDic = {**flexibAmino, **result}

    return result, flattened, flattenedFlexDic
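
# For reference, a self-contained sketch of the same Biopython calls on a toy
# peptide; it assumes only that Biopython is installed. The sequence below is
# hypothetical.
from Bio.SeqUtils.ProtParam import ProteinAnalysis

toy = ProteinAnalysis('MKWVTFISLLLLFSSAYS')
print(toy.molecular_weight())              # float, in Daltons
print(toy.aromaticity())                   # relative frequency of F, W, Y
print(toy.instability_index())             # > 40 suggests an unstable protein
print(toy.isoelectric_point())             # pH at which net charge is zero
print(toy.gravy())                         # grand average of hydropathy
print(toy.secondary_structure_fraction())  # (helix, turn, sheet) fractions
print(toy.flexibility())                   # sliding-window flexibility scores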
def prepareData(self, path, sparkContext):
    dataset = Parsers.parseDatasetContents(path, 'domains_pfam', 'domains')
    contentRDD = sparkContext.parallelize(dataset, numSlices=1000)
    # format tuple: (filename, ([domains], fastaID(genes)))
    perinstanceRDD = contentRDD.map(lambda x: (x[0][2], [x[0][1]])).reduceByKey(add)
    return perinstanceRDD.collect()
def createNegShuffle(self, posPerc):
    files = Utils.listFilesExt(self.source_path, self.ext)
    negPerc = 100 - posPerc
    positives = len(files)
    negativeSize = int((negPerc * positives) / posPerc)

    print('Negative percentage: ' + str(negPerc) + '%\n' +
          'Negative instances: ' + str(negativeSize) + '\n' +
          'Positive percentage: ' + str(posPerc) + '%\n' +
          'Positive instances: ' + str(positives) + '\n' +
          'Total corpus size: ' + str(negativeSize + positives))

    thisDecRatio = 0.0
    count = 0
    ratio = negativeSize / positives
    decRatio = ratio - int(ratio)

    print('Generating...')
    for file in files:
        # add up the decimal part of the ratio
        thisDecRatio += round(decRatio, 2)
        # reset range
        ratioRange = int(negativeSize / positives)
        # check if the decimal ratio added up to an extra duplicate
        if (thisDecRatio >= 1):
            ratioRange = int(ratio + thisDecRatio)
            thisDecRatio = 0

        for i in range(0, ratioRange):
            name = os.path.basename(file)
            result_file = name.split('.')[0] + '_' + str(i) + '.shuffled.negative.fasta'
            if ('nuc' in self.seqType):
                content = Parsers.genBankToNucleotide(file)
            elif ('amino' in self.seqType):
                _, content = Parsers.genBankToAminoacid(file)
            content = Utils.charGramShuffle(content, 2)
            content = '>' + name + '\n' + content
            count += 1
            Utils.writeFile(self.result_path + result_file, content)

    print('Total generated: ' + str(count) + '. Done!')
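
# A small sketch of the decimal-ratio bookkeeping above, on fabricated numbers:
# with 4 positive files and 10 negatives requested, the integer ratio is 2 with
# a remainder of 0.5, so every second file emits one extra shuffled negative.
negativeSize, positives = 10, 4
ratio = negativeSize / positives
decRatio = ratio - int(ratio)
thisDecRatio, perFile = 0.0, []
for _ in range(positives):
    thisDecRatio += round(decRatio, 2)
    ratioRange = int(negativeSize / positives)
    if thisDecRatio >= 1:
        ratioRange = int(ratio + thisDecRatio)  # emit the accumulated extra
        thisDecRatio = 0
    perFile.append(ratioRange)
print(perFile, sum(perFile))  # [2, 3, 2, 3] 10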
def loadGeneMap(self, sparkContext):
    content = Parsers.parseDatasetContents(self.geneMapPath, 'domains', 'domains')
    contentRDD = sparkContext.parallelize(content, numSlices=1000)
    genemapRDD = contentRDD.map(
        lambda x: (x[0][2], x[0][1].split('\n'))).reduceByKey(add)
    genemap = genemapRDD.collectAsMap()
    return genemap
def loadFilterMap(self, sparkContext):
    filterList = Utils.readFileLines(self.filterList)
    # returns tuples of shape ((file, content), 'domains')
    content = Parsers.parseDatasetContents(self.filterMap, 'domains', 'domains')
    domRDD = sparkContext.parallelize(content, numSlices=1000)
    domainsRDD = domRDD.map(
        lambda x: (Utils.getFileName(x[0][0]).replace('.domains', ''), x[0][1]))

    # list genes that have any domain in filterList;
    # splitting on '.' discards the '.\d+' version suffix of Pfam IDs
    filtered = domainsRDD.filter(lambda x: any(
        domain in filterList for domain in re.split(r'[\n.]', x[1])))
    result = filtered.collectAsMap().keys()
    genes = sorted([i for i in result])

    print('Loaded filter:', len(genes), 'genes will be filtered from',
          len(filterList), 'domains.')
    return genes
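
# Quick illustration of the domain filter above: splitting the '.domains'
# payload on both newlines and dots means a versioned Pfam ID such as
# 'PF00067.22' contributes 'PF00067' (plus a harmless '22'), so the '.\d+'
# version suffix never has to match the filter list. IDs are fabricated.
import re

payload = 'PF00067.22\nPF13639.6'
print(re.split(r'[\n.]', payload))  # ['PF00067', '22', 'PF13639', '6']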
def countOccurrence(self, dataPath, sparkContext):
    feats = self.loadFeatures()
    contentIds = []
    listContents = Parsers.parseDatasetContents(dataPath, self.featType, self.sourceType)
    parentDir = os.path.split(os.path.dirname(listContents[0][0][0]))[1]

    for info in listContents:
        filename = info[0][0]
        content = info[0][1]
        contentType = info[1]
        firstLine = Utils.readFileLines(filename)[0]
        seqId = firstLine.replace('>', '') if '|' in firstLine \
            else firstLine.split('.')[0].replace('>', '')
        label = Utils.getLabel(filename)
        # avoid cases in which test synthetic genes are long and,
        # after the split, different clusters share the same (gene) id
        for item in contentIds:
            if (seqId in item[0] and contentType in item[1]):
                seqId = seqId + '|'
        contentIds.append(tuple([seqId, contentType, content, label]))

    sourceRDD = sparkContext.parallelize(contentIds, numSlices=1000)
    occRDD = sourceRDD.map(lambda x: self.occurrence(x, feats))
    # combine features with the same ID and filter out instances
    # without enough features
    reducedRDD = occRDD.reduceByKey(lambda x, y: self.mergeFeatsSameId(x, y))
    ids = reducedRDD.map(lambda x: x[0]).collect()
    occ = reducedRDD.map(lambda x: x[1][0]).collect()
    labels = reducedRDD.map(lambda x: x[1][1]).collect()

    print('Features loaded.')
    return np.array(ids), np.array(occ), np.array(labels), parentDir
def buildDataset(self, path, sparkContext):
    result, ids, labels = [], [], []
    # define pickle protocol to bypass the 4 GiB pickling limit
    broadcast.Broadcast.dump = self.broadcast_dump

    dataset = Parsers.parseDatasetContents(path, self.featType, self.sourceType)
    parentDir = os.path.split(os.path.dirname(dataset[0][0][0]))[1]
    listRDD = sparkContext.parallelize(dataset, numSlices=5000)

    # x tuple in format: ((fileName, content, sequenceID), featureType)
    featuresRDD = listRDD.map(lambda x: (x[0][2], self.extractor.getFeatures(x)))

    if ('pfam' in self.featType):
        # concatenate by Pfam ID: label positive if at least one file contains the domain
        concatRDD = featuresRDD.map(lambda x: (''.join(x[1]), [x[0]])).reduceByKey(add)
    else:
        # concatenate contents by file ID
        concatRDD = featuresRDD.reduceByKey(add)

    # add instances; x tuple in format: (file ID, [feature, feature, ...])
    datasetRDD = concatRDD.map(lambda x: self.addInstance(x))
    dataset = datasetRDD.collectAsMap()

    # get max length among all instances
    maxLen = 1 if 'pfam' in self.featType \
        else int(datasetRDD.sortBy(lambda x: x[0][1], False).first()[0][1])
    self.maxLength = maxLen if maxLen > self.maxLength else self.maxLength

    # key: 0 = fasta.id, 1 = instance length, 2 = label
    for k, v in dataset.items():
        seqId, label = k[0], int(k[2])
        ids.append(seqId)
        labels.append(label)
        result.append(v)

    print('Done building dataset.')
    return ids, result, labels, parentDir
def createGoDataset(self):
    source_type = self.config.get('dataPipeline', 'source.type')
    blastPath = self.config.get('blaster', 'blastdb.path')
    blastPath = Utils.normalizePath(blastPath)
    blastName = self.config.get('blaster', 'blastdb.name')
    blastMapping = blastPath + blastName + '.tab'
    datapipe = DataPipeline.DataPipeline(source_type=source_type,
                                         source_path=self.source_path,
                                         result_path=self.result_path)

    _, file_content = Parsers.parseFastaToList(self.source_path, "")
    # skip entries whose GO result file already exists
    file_content = [content for content in file_content
                    if not os.path.isfile(self.result_path +
                                          os.path.basename(content[0]).replace('.fasta', '.go'))]

    sparkContext = SparkContext(conf=datapipe.initSpark('goDataset'))
    goterms = datapipe.getBLAST(file_content, sparkContext, blastTask='goTerms')

    count = 0
    notFound = 0
    for file, content in goterms.items():
        lines = content.split('\n')
        if (len(lines) == 2 and not str(lines[1])):
            notFound += 1
        else:
            filename = os.path.basename(file)
            resultFile = self.result_path + filename
            resultFile = resultFile.replace('.fasta', '.go')
            Utils.writeFile(resultFile, content)
            count += 1

    print('Done generating', str(count), 'GO term files.\nNo GO terms found for',
          str(notFound), 'files.')
def extractFeatures(self, dataPath, sparkContext, featPerInst):
    files, feats, kmerCounts, featPerInstance = [], [], [], []
    useKmer = 'kmers' in self.featType
    useProt = 'prot' in self.featType
    listContents = Parsers.parseDatasetContents(dataPath, self.featType, self.sourceType)

    if ('dictionary' in self.featType):
        feats += sorted(self.loadDictionary(self.dictPath))
    else:
        featRDD = sparkContext.parallelize(listContents, numSlices=1000)
        featuresRDD = featRDD.map(lambda x: (x[1], self.getFeatures(x)))

        if (featPerInst):
            # get a list of features per instance for embeddings
            featPerInstance = featuresRDD.values().collect()
            print(len(featPerInstance), 'instances processed.')

        if (not os.path.isfile(self.featFile)):
            if (useKmer):
                # filter RDD to keep only kmers, "flatten" arrays to a single list of kmers
                kmerRDD = featuresRDD.filter(
                    lambda x: 'kmer' in x[0]).flatMap(lambda x: x[1])
                # map each element to (k, v), reduce by key to group and
                # count features, then filter features by minOcc
                minOcc = int(self.minOcc)
                countAndFilter = kmerRDD.map(lambda x: (x, 1)).reduceByKey(add) \
                                        .filter(lambda x: x[1] >= minOcc)
                # drop the counts and collect only the keys
                kmerCounts = sorted(countAndFilter.collect())
                feats += sorted(countAndFilter.keys().collect())
                # filter out kmers already processed
                featuresRDD = featuresRDD.filter(lambda x: 'kmer' not in x[0])

            if (useProt):
                # filter RDD to keep only protein properties
                protRDD = featuresRDD.filter(lambda x: 'protanalys' in x[0])
                # select (unique) feature names
                feats += sorted(protRDD.flatMap(lambda x: x[1]).distinct().collect())
                featuresRDD = featuresRDD.filter(lambda x: 'protanalys' not in x[0])

            # get a flat list of all remaining distinct features
            completeFeatures = featuresRDD.flatMap(lambda x: x[1]).distinct()
            feats += completeFeatures.collect()

            if (len(feats) > 1):
                allFeatures = ''.join(str(i) + '\n' for i in feats)
                Utils.writeFile(self.featFile, allFeatures)
            if (len(kmerCounts) > 1):
                kmerCounts = ''.join(str(i).replace('(\'', '').replace('\',', '\t')
                                     .replace(')', '') + '\n' for i in kmerCounts)
                Utils.writeFile(self.featFile + 'count', kmerCounts)
            print(len(feats), 'features extracted.')
        else:
            feats = self.loadFeatures()

    return feats, featPerInstance, kmerCounts
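
# A compact, self-contained sketch of the k-mer counting step above: the
# classic map -> reduceByKey(add) -> filter pattern. Assumes a local PySpark
# installation; the k-mers and minOcc value are fabricated.
from operator import add
from pyspark import SparkContext

sc = SparkContext('local[*]', 'kmer-count-sketch')
kmers = sc.parallelize(['ATG', 'ATG', 'CGA', 'ATG', 'CGA', 'TTT'])
minOcc = 2
kept = (kmers.map(lambda k: (k, 1))   # each k-mer becomes a (key, 1) pair
             .reduceByKey(add)        # sum counts per distinct k-mer
             .filter(lambda kv: kv[1] >= minOcc))  # drop rare k-mers
print(sorted(kept.collect()))  # [('ATG', 3), ('CGA', 2)]
sc.stop()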
def splitAsClusters(self):
    self.source_path = Utils.normalizePath(self.source_path)
    slimIDs = self.config.getboolean('corpusPrep', 'slim.id')
    files = Utils.listFilesExt(self.source_path, self.ext)
    overlap = int((self.windowOverlap / 100) * self.length)

    for file in files:
        fileName = os.path.basename(file).split('.')[0]
        # build the output path locally so it is not appended to on every iteration
        result_path = self.result_path + fileName + '_len' + str(self.length) \
            + '_overlap' + str(self.windowOverlap)
        if (slimIDs):
            result_path += '_slimIDs'
        result_path += '.fasta'

        if (os.path.isfile(result_path)):
            print('File already exists: ' + result_path + '.\nDone.')
        else:
            result = []
            file = Parsers.sortFasta(file)
            sequences = Parsers.parseFasta(file)
            content, ids, entry, overlapIds = '', '', '', ''
            for fasta in sequences:
                content += str(fasta.seq.upper())
                ids += str(fasta.id) if not ids else '|' + str(fasta.id)
                if (slimIDs):
                    allIds = ids.split('|')
                    ids = allIds[0] + '|to|' + allIds[-1]
                while (len(content) > 0):
                    varSize = self.length - len(entry)
                    if (varSize <= overlap):
                        overlapIds += str(fasta.id) if not overlapIds \
                            else '|' + str(fasta.id)
                    entry += content[0:varSize]
                    if (len(entry) == self.length):
                        # move the cursor on the real sequence according
                        # to the variable length added
                        content = content[varSize:]
                        # add chunk to list
                        if (slimIDs):
                            allIds = ids.split('|')
                            ids = allIds[0] + '|to|' + allIds[-1]
                        result.append('>' + ids + '\n' + entry)
                        # make sure that the next entry starts with the overlap
                        entry = entry[len(entry) - overlap:]
                        if (len(content) > 0):
                            ids = overlapIds
                            overlapIds = ''
                        else:
                            ids = ''
                    elif (len(content) > 0 and len(entry) < self.length):
                        content = content[len(entry):]
            Utils.writeFile(result_path, '\n'.join(result))
    print('Done.')
class CmdMux(object):
    '''This is the main control unit; it sends commands to all units'''
    # use our logger metaclass
    __metaclass__ = CmdLog

    def __init__(self):
        # class behaves like a singleton:
        # all parsers live here, all record blueprints,
        # and the last record id as well
        self.console_logger = True
        self.prs = Parsers()
        self.account_opened = False
        self.record_id = 0
        self.logger.debug("Created multiplexer for user's commands")

    # switcher for loggers
    def debug(self, params):
        debug_toggle = self.prs.debug_check(params)
        if debug_toggle:
            if self.console_logger and debug_toggle == 'on':
                return msg.logger_error_on
            elif self.console_logger and debug_toggle == 'off':
                self.console_logger = False
                self.logger.removeHandler(self.out_handler)
                return msg.logger_off
            elif not self.console_logger and debug_toggle == 'off':
                return msg.logger_error_off
            else:
                self.console_logger = True
                self.logger.addHandler(self.out_handler)
                return msg.logger_on
        else:
            return msg.logger_error

    # info methods that do not need any value or calculation
    def hello(self):
        self.logger.debug("Wallet program initiated")
        return msg.hello_message

    def quit(self):
        self.logger.debug("Exiting from wallet")
        return msg.quit_message

    def help(self):
        self.logger.debug("Asking for help")
        return msg.help_message

    def tutorial(self):
        self.logger.debug("Asking for tutorial")
        return msg.tutorial_message

    def no_account(self):
        self.logger.error("No accounts opened")
        return msg.no_account_message

    # main work with account creation/initialization
    def new_account(self, params):
        '''Parses an already-validated dict object for a new account.
        It creates all memory units and record stores, saves them,
        and writes the first history record for the new account.'''
        last_record_id = 0
        self.logger.debug("Creating new clean account")
        # shortest way for now: since all stores (cash, debit, credit,
        # savings) are secure, values are never assigned on instance
        # creation, only through setters
        # if not all(i is not None and isinstance(i, str) for i in
        #         [name, cash, debit, credit, savings]):
        # cannot check this way: it raises UnboundLocalError
        # (local variable 'cash' referenced before assignment)
        blueprint = self.prs.new_account_check(params)
        if blueprint:
            self.account_opened = True
            # creating temp objects
            tmpname = blueprint['name']
            # True for credit means it is initialized as if we already
            # owe money; of course you can set the value to 0
            tmpcash, tmpdebit, tmpcredit, tmpsavings = \
                Cash(), Debit(), Credit(True), Savings()
            tmppayments, tmphistory = PayHistory(), BalanceHistory()
            # assigning values
            tmpcash.value = int(blueprint['cash'])
            tmpdebit.value = int(blueprint['debit'])
            tmpcredit.value = int(blueprint['credit'])
            tmpsavings.value = int(blueprint['savings'])
            # saving the complete object (new account, everything empty)
            self.account = {
                'name': tmpname,
                'cash': tmpcash,
                'debit': tmpdebit,
                'credit': tmpcredit,
                'savings': tmpsavings,
                'payments': tmppayments,
                'history': tmphistory,
                'last_record_id': last_record_id,
            }
            # in fact, this is the first record of the account's life
            self.__snapshot_history()
            self.__save()
            return msg.new_account_message
        else:
            # if we get here, the blueprint is not correct,
            # so we return new_account_error
            return msg.new_account_error

    def open_account(self, params):
        '''Opens the pickle file and loads one big dict object
        with all our Cash(), BalanceRecords(), etc.'''
        self.logger.debug("Opening your account")
        account_name = self.prs.open_account_check(params)
        if account_name:
            try:
                with open(account_name + '.pickle', 'rb') as f:
                    # we load everything into a dict object,
                    # so all methods can work with it
                    self.account = pickle.load(f)
                # last_record_id is stored in the account itself
                self.record_id = self.account['last_record_id']
                self.account_opened = True
                return msg.open_account_message
            except TypeError:
                self.logger.error("Failure during opening")
                return msg.open_account_error
            except IOError:
                self.logger.error("Account does not exist")
                return msg.no_exists_error
        else:
            self.account_opened = False
            return msg.open_account_name_error

    # status methods that need the account and record objects
    def balance(self):
        self.logger.debug("Getting current balance")
        account_name = self.account['name']
        out = [self.cash(), self.debit(), self.credit(), self.savings()]
        return "{0} of {1} account\n {2}".format(msg.balance_message,
                                                 account_name, out)

    def cash(self):
        self.logger.debug("Getting your cash")
        account_cash = self.account['cash'].value
        return "{0}: {1}".format(msg.cash_message, account_cash)

    def debit(self):
        self.logger.debug("Getting your debit account")
        account_debit = self.account['debit'].value
        return "{0}: {1}".format(msg.debit_message, account_debit)

    def credit(self):
        self.logger.debug("Getting your credit account")
        account_credit = self.account['credit'].value
        return "{0}: {1}".format(msg.credit_message, account_credit)

    def savings(self):
        self.logger.debug("Getting your savings")
        account_savings = self.account['savings'].value
        return "{0}: {1}".format(msg.savings_message, account_savings)

    # history methods, for last payments and balance history
    def history(self, params):
        self.logger.debug("Getting your last balance history")
        num = self.prs.history_check(params)
        if num:
            # returns a list of balance record objects;
            # each record is a BalanceRecord
            records = self.account['history'].get_record(int(num))
            out = ''
            for record in records:
                line = "|{}|{}|cash:{}|debit:{}|credit:{}|safe:{}|\n".format(
                    record.last_id, record.date, record.cash,
                    record.debit, record.credit, record.savings,)
                out += line
            return msg.history_message + '\n' + out
        else:
            return msg.history_error

    def payments(self, params):
        self.logger.debug("Getting your last payments")
        num = self.prs.payments_check(params)
        if num:
            # returns a list of payment objects;
            # each record is a PayRecord
            records = self.account['payments'].get_payment(int(num))
            out = ''
            for record in records:
                line = "|{}|{}|from:{}|sum:{}{}|comment:{}|\n".format(
                    record.last_id, record.date, record.category,
                    record.sign, record.value, record.comment,)
                out += line
            return msg.payments_message + '\n' + out
        else:
            return msg.payments_error

    # real operation methods
    def pay(self, params):
        self.logger.debug("Spending money")
        blueprint = self.prs.pay_check(params)
        if blueprint:
            # validate the category before touching the account,
            # otherwise an unknown category raises a KeyError
            if blueprint['category'] not in category_list:
                return msg.pay_category_error
            money_i_have = self.account[blueprint['category']].value
            if ((blueprint['category'] != 'credit')
                    and int(blueprint['value']) > int(money_i_have)):
                return msg.pay_no_money_error
            else:
                pay = PayRecord(blueprint['category'], blueprint['value'],
                                blueprint['comment'])
                # incrementing ids
                self.record_id += 1
                pay.last_id += self.record_id
                pay.sign = '-'
                self.account['last_record_id'] = self.record_id
                # paying always decreases the account; it is up to the
                # Credit class to interpret '-' as increasing debt, or we
                # could specify a credit limit and show limit - value
                self.account[blueprint['category']].value -= \
                    int(blueprint['value'])
                self.account['payments'].put_payment(pay)
                # we always take a history snapshot
                # and always save the pickle file
                self.__snapshot_history()
                self.__save()
                return msg.pay_message
        else:
            return msg.pay_error
    def income(self, params):
        self.logger.debug("Great, we got money now")
        blueprint = self.prs.income_check(params)
        if blueprint:
            if blueprint['category'] not in category_list:
                return msg.income_category_error
            else:
                income = PayRecord(blueprint['category'], blueprint['value'],
                                   blueprint['comment'])
                self.record_id += 1
                income.last_id += self.record_id
                income.sign = '+'
                self.account['last_record_id'] = self.record_id
                # incoming money is always '+'; it is also up to the
                # Credit account to parse it as decreasing debt
                self.account['payments'].put_payment(income)
                self.account[blueprint['category']].value += \
                    int(blueprint['value'])
                self.__snapshot_history()
                self.__save()
                return msg.income_message
        else:
            return msg.income_error

    def withdraw(self, params):
        self.logger.debug("Taking money from account to cash")
        blueprint = self.prs.withdraw_check(params)
        if blueprint:
            if blueprint['category'] not in withdraw_category_list:
                return msg.withdraw_category_error
            else:
                pay = PayRecord(blueprint['category'], blueprint['value'],
                                "withdraw")
                self.record_id += 1
                pay.last_id += self.record_id
                self.account['last_record_id'] = self.record_id
                # withdrawing always moves money from an account into cash
                self.account['payments'].put_payment(pay)
                self.account[blueprint['category']].value -= \
                    int(blueprint['value'])
                self.account['cash'].value += int(blueprint['value'])
                self.__snapshot_history()
                self.__save()
                return msg.succes_withdraw_message
        else:
            return msg.withdraw_error

    # sync method to update the pickle file
    def __save(self):
        self.logger.debug("Saving all to pickle file")
        with open(self.account['name'] + ".pickle", 'wb') as f:
            pickle.dump(self.account, f, -1)
        return msg.save_message

    # sync method to record stats for the balance history
    def __snapshot_history(self):
        self.logger.debug("Updating balance history")
        last_record = BalanceRecord(self.account['cash'].value,
                                    self.account['debit'].value,
                                    self.account['credit'].value,
                                    self.account['savings'].value,)
        # a snapshot is never taken on its own: it always follows
        # a payment, a withdrawal, or an income
        last_record.last_id = self.record_id
        self.account['history'].put_record(last_record)
        return msg.snapshot_history_message
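
# A minimal, self-contained sketch of the persistence scheme used by __save()
# and open_account(): the whole account dict is pickled to '<name>.pickle'
# with the highest protocol (-1) and loaded back verbatim. The dict below is
# a simplified stand-in for the real account object, for illustration only.
import pickle

demo_account = {'name': 'demo', 'last_record_id': 0}
with open(demo_account['name'] + '.pickle', 'wb') as f:
    pickle.dump(demo_account, f, -1)  # -1 selects the highest pickle protocol
with open('demo.pickle', 'rb') as f:
    restored = pickle.load(f)
assert restored == demo_account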