def main():
    # check that everything has been installed correctly
    root = os.path.dirname(os.path.abspath(__file__))
    root += '/'
    # get the parameters
    args, parser = get_params()
    # set the required variables
    debug = args.debug
    # output dir
    outDir = '{:s}/mmseqs2_src/'.format(os.path.realpath(args.output_directory))
    # Make some checks before extracting the package
    # check that GCC is installed
    gccVer, gccOk = check_gcc()
    if not gccOk:
        sys.stderr.write('\nERROR: no GCC compiler was found in your system.\n')
        sys.stderr.write('Please go to https://gcc.gnu.org\n')
        sys.stderr.write('and follow the instructions to install the latest version of GCC on your system.\n')
        sys.exit(-5)
    # check that cmake is installed
    cmake3Ver, cmake3Ok = check_cmake3()
    cmakeVer, cmakeOk = check_cmake()
    # if neither version is installed then exit with an error
    if not (cmake3Ok or cmakeOk):
        sys.stderr.write('\nERROR: you must install cmake version 3.10 or above before continuing.\n')
        sys.stderr.write('Please go to https://cmake.org\n')
        sys.stderr.write('and follow the instructions on how to install the latest version of cmake on your system.\n')
        sys.exit(-5)
    # path to the source package
    tarPath = os.path.join(root, 'mmseqs2_src/mmseqs.tar.gz')
    # skip the extraction if the directory already exists
    if os.path.isdir(outDir):
        print('WARNING: the directory\n{:s}'.format(outDir))
        print('already exists; if you want to extract the package,')
        print('please remove the above-mentioned directory.')
        print('\nEXIT: no file was extracted.')
        sys.exit(-2)
    # create the directory if it does not exist
    systools.makedir(outDir)
    #if debug:
    #    print('The source for MMseqs2 will be stored in\n{:s}'.format(outDir))
    systools.untar(tarPath, outDir=outDir, debug=debug)
    # create the build directory
    systools.makedir(os.path.join(outDir, 'build/'))
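
# The compiler checks above rely on helpers such as check_gcc(), which are not
# shown in this excerpt. Below is a minimal sketch of how such a check could
# work; the (version_string, found_flag) return shape is an assumption based on
# how the tuple is unpacked above.
def _check_gcc_sketch():
    """Return (version string, found flag) for the system GCC, if any."""
    import shutil as _shutil
    import subprocess as _subprocess
    gccPath = _shutil.which('gcc')  # None if gcc is not on the PATH
    if gccPath is None:
        return ('', False)
    # 'gcc --version' prints the version on the first line of stdout
    out = _subprocess.run(['gcc', '--version'], stdout=_subprocess.PIPE)
    return (out.stdout.decode().split('\n')[0], True)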
def mmseqs_createindex(dbPath, debug=False):
    """Create an index from an MMseqs2 database file."""
    if debug:
        print('mmseqs_createindex :: START')
        print('Input mmseqs2 db file:\t%s' % dbPath)
    # check that the database file exists
    if not os.path.isfile(dbPath):
        sys.stderr.write('The file %s was not found, please provide the path to an MMseqs2 database file.\n' % dbPath)
        sys.exit(-2)
    #''' USE IN FUTURE VERSION OF CREATEINDEX
    #print(dbPath)
    tmpBname = os.path.basename(dbPath)
    tmpDir = '{:s}/tmp_{:s}/'.format(os.path.dirname(dbPath), os.path.basename(tmpBname.split('.', 1)[0]))
    systools.makedir(tmpDir)
    makeIdxCmd = '{:s} createindex {:s} {:s} --threads 2 -v 0'.format(get_mmseqs_path(), dbPath, tmpDir)
    #'''
    # command to be executed
    # EXAMPLE: mmseqs createindex in.mmseqs2_db
    #makeIdxCmd = '{:s} createindex {:s} -v 0'.format(get_mmseqs_path(), dbPath)
    if debug:
        print('mmseqs2 createindex CMD:\t%s' % makeIdxCmd)
    # execute the system call
    process = subprocess.Popen(makeIdxCmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout_val, stderr_val = process.communicate()  # get stdout and stderr
    process.wait()
    if debug:
        print('STDOUT:\n%s\n' % stdout_val)
        print('STDERR:\n%s\n' % stderr_val)
    # make sure that the index files have been properly created
    idx1 = '%s.sk6' % dbPath
    if not os.path.isfile(idx1):
        sys.stderr.write('The MMseqs2 index file %s could not be created.\n' % idx1)
        sys.exit(-2)
    idx2 = '%s.sk6.index' % dbPath
    if not os.path.isfile(idx2):
        sys.stderr.write('\nWARNING: the MMseqs2 index file %s could not be created.\n' % idx2)
        #sys.exit(-2)
    # remove the temporary directory
    shutil.rmtree(path=tmpDir)
    # return an output tuple
    return (stdout_val, stderr_val, makeIdxCmd, idx1, idx2)
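
# The command above is issued through a shell (shell=True). As a sketch of an
# alternative design, the same invocation can be issued without a shell by
# passing the argument vector directly, which avoids quoting issues with
# unusual paths; the flags mirror the string built in mmseqs_createindex().
def _createindex_no_shell_sketch(dbPath, tmpDir):
    cmd = [get_mmseqs_path(), 'createindex', dbPath, tmpDir,
           '--threads', '2', '-v', '0']
    proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    return (proc.stdout, proc.stderr)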
def mmseqs_createdb(inSeq, outDir=os.getcwd(), debug=False):
    """Create a database file for mmseqs2 from the input sequence file."""
    if debug:
        print('mmseqs_createdb :: START')
        print('Input FASTA file:\t%s' % inSeq)
        print('Outdir:\t%s' % outDir)
    # check that the input file exists
    if not os.path.isfile(inSeq):
        sys.stderr.write('The file %s was not found, please provide the path to a valid FASTA file.\n' % inSeq)
        sys.exit(-2)
    # make sure the output directory path ends with a slash
    if outDir[-1] != '/':
        outDir += '/'
    # create the dir if it does not already exist
    systools.makedir(outDir)
    # set the db name
    dbName = os.path.basename(inSeq)
    dbName = dbName.split('.')[0]  # take the left part of the file name
    dbName = '%s.mmseqs2db' % dbName
    dbPath = '%s%s' % (outDir, dbName)
    # command to be executed
    # EXAMPLE: mmseqs createdb in.fasta /outdir/mydb
    makeDbCmd = '%s createdb %s %s -v 0' % (get_mmseqs_path(), inSeq, dbPath)
    if debug:
        print('mmseqs2 createdb CMD:\t%s' % makeDbCmd)
    # execute the system call
    process = subprocess.Popen(makeDbCmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout_val, stderr_val = process.communicate()  # get stdout and stderr
    process.wait()
    if debug:
        print('STDOUT:\n%s\n' % stdout_val)
        print('STDERR:\n%s\n' % stderr_val)
    # return a tuple with the results
    return (stdout_val, stderr_val, makeDbCmd, dbPath)
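
# A minimal usage sketch chaining the two helpers above: build the database,
# then index it for faster repeated searches. The FASTA path is a hypothetical
# placeholder, and MMseqs2 must be reachable via get_mmseqs_path().
def _createdb_and_index_sketch():
    _, _, _, dbPath = mmseqs_createdb('/tmp/proteome_a.fasta',
                                      outDir='/tmp/mmseqs2_databases/',
                                      debug=True)
    mmseqs_createindex(dbPath, debug=True)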
def main():
    # check that everything has been installed correctly
    root = os.path.dirname(os.path.abspath(__file__))
    root += '/'
    # get the parameters
    args, parser = get_params()
    # set the required variables
    debug = args.debug
    # output dir
    outDir = '{:s}/sonicparanoid_test/'.format(os.path.realpath(args.output_directory))
    # path to the source package
    testSrcDir = os.path.join(root, 'example/')
    # skip the extraction if the directory already exists
    if os.path.isdir(outDir):
        print('WARNING: the directory\n{:s}'.format(outDir))
        print('already exists; if you want to extract the package,')
        print('please remove the above-mentioned directory.')
        print('\nEXIT: no file was copied.')
        sys.exit(-2)
    # create the directory if it does not exist
    systools.makedir(outDir)
    # copy the test files
    print(outDir)
    copytree(testSrcDir, outDir, symlinks=False, ignore=None)
    #copytree(testSrcDir, outDir)
    if os.path.isdir(outDir):
        print('INFO: all test files were successfully copied to\n{:s}\n'.format(outDir))
        # suggest the command to run
        print('Go inside the directory\n{:s}\nand type\n'.format(outDir))
        print('sonicparanoid -i ./test_input -o ./test_output -m fast -t 4')
def main():
    # check that everything has been installed correctly
    root = os.path.dirname(os.path.abspath(__file__))
    root += '/'
    # get gcc version
    ##gccVer, gccOk = check_gcc()
    # compile binaries for multi-species orthology if required
    '''
    check_quick_multiparanoid_installation(gccVer, gccOk)
    '''
    #### FORCE IT TO OK
    check_quick_multiparanoid_installation('FORCED TO TRUE: System specific GCC version', True)
    ####
    # check MMseqs2 installation
    # check_mmseqs_installation(root)
    # start measuring the execution time
    ex_start = time.perf_counter()
    # get the parameters
    args, parser = get_params()
    # start setting the needed variables
    debug = args.debug
    inDir = None
    if args.input_directory is not None:
        inDir = '%s/' % os.path.realpath(args.input_directory)
    # check that the input directory has been provided
    if inDir is None:
        sys.stderr.write('\nERROR: no input species.\n')
        parser.print_help()
        sys.exit(-5)
    # output dir
    outDir = '%s/' % os.path.realpath(args.output_directory)
    # create the input directory inside the output directory
    tmpInDir = os.path.join(outDir, 'input/')
    # if it already exists make sure all files in it are removed
    if os.path.isdir(tmpInDir):
        for el in os.listdir(tmpInDir):
            os.remove(os.path.join(tmpInDir, el))
    else:
        systools.makedir(tmpInDir)
    # move the input files to the temporary input directory
    inProtCnt = 0
    for f in os.listdir(inDir):
        if f.startswith('.'):
            continue
        tmpPath = os.path.join(inDir, f)
        inProtCnt += 1
        if os.path.isfile(tmpPath):
            systools.copy(tmpPath, tmpInDir, debug=debug)
    # INPUT CHECK
    inDir = tmpInDir
    # check the integrity of the input files
    inputModList = check_input_files(inDir, debug=debug)
    # rename the input files where required
    for tpl in inputModList:
        newPath, newSymbol = tpl
        print(newPath)
        oldPath = newPath.replace('_no_blanks', '')
        # remove the old file and rename the one with valid headers
        if os.path.isfile(oldPath):
            print(oldPath)
            os.remove(oldPath)
        systools.move(newPath, oldPath, debug=debug)
    if debug:
        print('Input files whose FASTA header was modified:\t%d' % len(inputModList))
    # optional directories setup
    alignDir = None
    if args.shared_directory is not None:
        alignDir = '%s/' % os.path.realpath(args.shared_directory)
    else:
        alignDir = os.path.join(outDir, 'alignments/')
    systools.makedir(alignDir)
    dbDirectory = None
    if args.mmseqs_dbs is not None:
        dbDirectory = '%s/' % os.path.realpath(args.mmseqs_dbs)
    else:
        dbDirectory = '%smmseqs2_databases/' % outDir
    cpus = args.threads
    #coff = args.cutoff
    coff = 40
    owOrthoTbls = args.overwrite_tables
    multiOnly = args.multi_species_only
    skipMulti = args.skip_multi_species
    runMode = args.mode
    maxGenePerSp = args.max_gene_per_sp
    # set the sensitivity value for MMseqs2
    sensitivity = 4.0  # default
    if runMode == 'sensitive':
        sensitivity = 6.0
    elif runMode == 'fast':
        sensitivity = 2.5
    elif runMode == 'most-sensitive':
        sensitivity = 7.5
    overwrite = args.overwrite
    if overwrite:
        owOrthoTbls = True
    # set the sensitivity using a user-specified value if needed
    if args.sensitivity:
        if 1. <= args.sensitivity <= 7.5:
            sensitivity = round(args.sensitivity, 1)
            print('WARNING: the run mode \'%s\' will be overwritten by the custom MMseqs sensitivity value of %s.\n' % (runMode, str(args.sensitivity)))
        else:
            sys.stderr.write('\nERROR: the sensitivity parameter must have a value between 1.0 and 7.5\n')
            sys.exit(-5)
    # check the maximum length difference if it differs from the default
    if args.max_len_diff != 0.5:
        if not (0. <= args.max_len_diff <= 1.0):
            sys.stderr.write('\nERROR: the length difference ratio must have a value between 0 and 1.\n')
            sys.exit(-5)
    # set the variable to control the creation of orthologous pairs
    output_relations = args.output_pairs
    if args.qfo_2011:
        output_relations = True
    updateId = None
    if args.update is not None:
        updateId = args.update
    # check that mutually exclusive options are not selected
    if multiOnly and skipMulti:
        sys.stderr.write('\nERROR: you cannot select the options --skip-multi-species and --multi-species-only at the same time.\n')
        sys.exit(-5)
    # set the variable for MMseqs2 database indexing
    idx_dbs = True
    if args.no_indexing:
        idx_dbs = False
    # name for the multi-species groups
    multiSpeciesClstrNameAll = 'multispecies_clusters.tsv'
    print('\nSonicParanoid will be executed with the following parameters:')
    print('Input directory:\t{:s}'.format(inDir))
    print('Input proteomes:\t{:d}'.format(inProtCnt))
    print('Output directory:\t{:s}'.format(outDir))
    print('Alignments directory:\t{:s}'.format(alignDir))
    print('Create pre-filter indexes:\t{:s}'.format(str(idx_dbs)))
    print('Complete overwrite:\t{:s}'.format(str(overwrite)))
    print('Re-create ortholog tables:\t{:s}'.format(str(owOrthoTbls)))
    print('CPUs:\t{:d}'.format(cpus))
    print('Run mode:\t%s (MMseqs2 s=%s)' % (runMode, str(sensitivity)))
    # check if the run already exists
    spFile = pairsFile = None
    # SUGGEST THE USER TO USE THE UPDATE FEATURE
    spFile = os.path.join(outDir, 'species.txt')
    # check if it is an update or not
    if os.path.isfile(spFile) and (not owOrthoTbls):
        # fill a set with the new species names
        newfSet = set()
        flist = os.listdir(inDir)
        for f in flist:
            if f.startswith('.DS_'):
                continue
            newfSet.add(f)
        # now create a set from the old species list
        oldfSet = set()
        for ln in open(spFile):
            oldfSet.add(ln.rstrip('\n'))
        # make the union of the two sets
        unionSet = oldfSet.union(newfSet)
        # if the lengths differ then suggest the user to use
        # the update or overwrite option
        if len(unionSet) != len(oldfSet):
            if updateId is None:
                sys.stderr.write('\n\nThe file with species already exists, but the new species list is different from the existing one.')
                sys.stderr.write('\nThis suggests that you have added or removed species from the input directory.')
                sys.stderr.write('\nPlease consider using the \'--update update_name\' option.')
                sys.stderr.write('\nAlternatively you could completely overwrite a previous run using the \'--overwrite\' option.\n')
                sys.exit(-6)
            else:
                spFile = None
    # start the processing
    update_run = False
    if updateId is None:
        if multiOnly:
            # skip everything and execute the multi-species clustering directly
            spFile = '%sspecies.txt' % outDir
            pairsFile = '%sspecies_pairs.txt' % outDir
        else:
            spFile, pairsFile = orthodetect.run_sonicparanoid2_multiproc(inDir, outDir=outDir, threads=cpus, sharedDir=alignDir, mmseqsDbDir=dbDirectory, create_idx=idx_dbs, sensitivity=sensitivity, cutoff=coff, confCutoff=0.05, lenDiffThr=args.max_len_diff, overwrite_all=overwrite, overwrite_tbls=owOrthoTbls, update_run=update_run, keepAlign=args.keep_raw_alignments, debug=debug)
        # run the multi-species clustering
        if not skipMulti:
            # copy the sql tables
            multiOutDir = os.path.join(outDir, 'multi_species/')
            sqlPaths = orthodetect.fetch_sql_files(rootDir=outDir, outDir=multiOutDir, pairsFile=pairsFile, coreOnly=False, debug=debug)
            print('Ortholog tables loaded for multi-species orthology:\t%d' % len(sqlPaths))
            sys.stdout.write('\nCreating ortholog groups...')
            multisp_start = time.perf_counter()
            quickparaRoot = orthodetect.get_quick_multiparanoid_src_dir()
            # create the multi-species clusters
            orthodetect.run_quickparanoid(sqlTblDir=multiOutDir, outDir=multiOutDir, srcDir=quickparaRoot, outName=multiSpeciesClstrNameAll, speciesFile=spFile, maxGenePerSp=maxGenePerSp, debug=debug)
            sys.stdout.write('Ortholog groups creation elapsed time (seconds):\t{:s}\n'.format(str(round(time.perf_counter() - multisp_start, 3))))
        # output directory for the stats
        relDict = '%sortholog_relations/' % outDir
        # calculate stats on the generated clusters
        #orthodetect.calc_ortholog_group_stats(rootDir=outDir, outDir=relDict, outName=None, pairsFile=pairsFile, debug=debug)
        # ALL
        orthoRelName = 'ortholog_pairs.tsv'
        if args.qfo_2011:
            orthoRelName = 'ortholog_pairs_benchmark.tsv'
        # generate the relations
        if output_relations:
            orthodetect.extract_ortholog_pairs(rootDir=outDir, outDir=relDict, outName=orthoRelName, pairsFile=pairsFile, coreOnly=False, splitMode=args.qfo_2011, debug=debug)
    else:
        # update the database
        update_run = True
        spFile, pairsFile = orthodetect.run_sonicparanoid2_multiproc(inDir, outDir=outDir, threads=cpus, sharedDir=alignDir, mmseqsDbDir=dbDirectory, create_idx=idx_dbs, sensitivity=sensitivity, cutoff=coff, confCutoff=0.05, lenDiffThr=args.max_len_diff, overwrite_all=overwrite, overwrite_tbls=owOrthoTbls, update_run=update_run, keepAlign=args.keep_raw_alignments, debug=debug)
        # run the multi-species clustering
        if not skipMulti:
            # copy the sql tables
            multiOutDir = os.path.join(outDir, '{:s}/'.format(updateId))
            sqlPaths = orthodetect.fetch_sql_files(rootDir=outDir, outDir=multiOutDir, pairsFile=pairsFile, coreOnly=False, debug=debug)
            print('Ortholog tables loaded for multi-species orthology:\t%d' % len(sqlPaths))
            sys.stdout.write('\nCreating ortholog groups...')
            multisp_start = time.perf_counter()
            quickparaRoot = orthodetect.get_quick_multiparanoid_src_dir()
            # create the multi-species clusters
            orthodetect.run_quickparanoid(sqlTblDir=multiOutDir, outDir=multiOutDir, srcDir=quickparaRoot, outName=multiSpeciesClstrNameAll, speciesFile=spFile, maxGenePerSp=maxGenePerSp, debug=debug)
            sys.stdout.write('Ortholog groups creation elapsed time (seconds):\t{:s}\n'.format(str(round(time.perf_counter() - multisp_start, 3))))
        # extract the ortholog pairs
        relDict = '%sortholog_relations/' % outDir
        orthoRelName = '{:s}_relations.tsv'.format(updateId)
        if args.qfo_2011:
            orthoRelName = '{:s}_relations_benchmark.tsv'.format(updateId)
        if output_relations:
            orthodetect.extract_ortholog_pairs(rootDir=outDir, outDir=relDict, outName=orthoRelName, pairsFile=pairsFile, coreOnly=False, splitMode=args.qfo_2011, debug=debug)
    ex_end = round(time.perf_counter() - ex_start, 3)
    sys.stdout.write('\nTotal elapsed time (seconds):\t{:0.3f}\n'.format(ex_end))
    # remove files that are no longer required
    cleanup(rootDir=outDir, debug=debug)
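
# get_params() is used throughout this module but defined elsewhere. The sketch
# below shows a minimal argparse setup covering only a subset of the attributes
# accessed in main() above; the defaults are assumptions, not the project's
# actual values (argparse maps '--max-len-diff' to args.max_len_diff, etc.).
def _get_params_sketch():
    import argparse
    parser = argparse.ArgumentParser(description='SonicParanoid (sketch)')
    parser.add_argument('-i', '--input-directory', type=str, default=None)
    parser.add_argument('-o', '--output-directory', type=str, required=True)
    parser.add_argument('-t', '--threads', type=int, default=4)
    parser.add_argument('-m', '--mode', type=str, default='default',
                        choices=['fast', 'default', 'sensitive', 'most-sensitive'])
    parser.add_argument('--sensitivity', type=float, default=0.0)
    parser.add_argument('--max-len-diff', type=float, default=0.5)
    parser.add_argument('--overwrite', action='store_true')
    parser.add_argument('--debug', action='store_true')
    args = parser.parse_args()
    return (args, parser)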
def process_multisp_tbl(inTbl: str, outPath: str, debug: bool = False) -> None:
    """Check the consistency of the ortholog groups table and extract the main stats."""
    if debug:
        print('process_multisp_tbl :: START')
        print('Input ortholog groups table:\t{:s}'.format(inTbl))
        print('Output stats file:\t{:s}'.format(outPath))
    # check that the input table is valid
    if not os.path.isfile(inTbl):
        sys.stderr.write('\nERROR (file not found): you must provide a valid path to the text file containing the ortholog groups table generated using SonicParanoid.\n')
        sys.exit(-2)
    # create the directory that will contain the output file if required
    systools.makedir(os.path.dirname(outPath))
    # start processing the ortholog groups
    fd = open(inTbl, 'r')
    # extract the header and check the validity of the input file
    hdr_columns: List[str] = fd.readline().rstrip('\n').split('\t')
    # check the hdr
    if not hdr_columns[0] == 'group_id':
        sys.stderr.write('\nERROR: the header is not valid.\n')
        sys.exit('Make sure that the ortholog groups file was generated using SonicParanoid.')
    spCntStr: str = str(len(hdr_columns[4:-1]) / 2)
    spCntStr = spCntStr.strip()
    # check that the number of species is valid, i.e., that no column was removed
    # from the file; in that case the division above must give a float ending with '.0'
    if not spCntStr.endswith('.0'):
        sys.stderr.write('\nERROR: there is a problem with the number of species found in the table.\nMake sure you did not manually remove any column from the original output.\n')
        sys.exit(-2)
    # convert the string to int
    spCnt: int = int(spCntStr.split('.', 1)[0])
    # variables to store the counts
    totCnt: int = 0
    allSpCnt: int = 0
    twoSpCnt: int = 0
    mostSeedsId: str = ''
    maxSeedsCnt: int = 0
    # start looping through the clusters
    for clstr in fd:
        flds: List[str] = clstr.rstrip('\n').split('\t')
        totCnt += 1
        clstrId: str = flds[0]
        # check if it contains all species
        if int(flds[2]) == spCnt:
            allSpCnt += 1
        elif int(flds[2]) == 2:
            twoSpCnt += 1
        # find the cluster with the highest number of orthologs with confidence 1.0
        seedsCnt = int(flds[3])
        if seedsCnt > maxSeedsCnt:
            maxSeedsCnt = seedsCnt
            mostSeedsId = clstrId
    fd.close()
    # percentages of clusters with all and with two species
    allSpPct: float = round(float(allSpCnt / totCnt) * 100., 2)
    twoSpPct: float = round(float(twoSpCnt / totCnt) * 100., 2)
    # open the output file
    ofd = open(outPath, 'w')
    ofd.write('Stats for the ortholog groups file:\n{:s}\n'.format(inTbl))
    ofd.write('\nClusters:\t{:d}'.format(totCnt))
    ofd.write('\nSpecies:\t{:d}'.format(spCnt))
    ofd.write('\nClusters with all species:\t{:d}'.format(allSpCnt))
    ofd.write('\nPercentage of clusters with all species:\t{:10.2f}'.format(allSpPct))
    ofd.write('\nClusters with two species:\t{:d}'.format(twoSpCnt))
    ofd.write('\nPercentage of clusters with two species:\t{:10.2f}'.format(twoSpPct))
    ofd.write('\nCluster with highest number of main orthologs:\t{:s}'.format(mostSeedsId))
    ofd.close()
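
# A minimal usage sketch for process_multisp_tbl(); the paths below are
# hypothetical placeholders. As deduced from the parser above (an assumption,
# not documented here), the table is expected to start with a 'group_id'
# column, hold the per-cluster species count in the 3rd column, the seed count
# in the 4th, and two columns per species between the 5th and the last column.
def _process_multisp_tbl_sketch():
    process_multisp_tbl('/tmp/multispecies_clusters.tsv',
                        '/tmp/multispecies_clusters_stats.txt',
                        debug=True)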
def extract_fasta(clstrDict: Dict[str, Dict[str, List[str]]], fastaDir: str, outDir: str, multiFasta: bool = False, annotationDict: Dict[str, List[List[str]]] = {}, debug: bool = False) -> None:
    """Extract the FASTA sequences for each cluster."""
    if debug:
        print('\nextract_fasta :: START')
        print('Clusters for which sequences will be extracted:\t{:d}'.format(len(clstrDict)))
        print('Directory with the species files: {:s}'.format(fastaDir))
        print('Output directory: {:s}'.format(outDir))
        print('Output multiple FASTA files: {:s}'.format(str(multiFasta)))
        print('Length of annotation dictionary: {:d}'.format(len(annotationDict)))
    annotate: bool = False
    if len(annotationDict) > 0:
        annotate = True
    # check that the directory with the FASTA files exists
    if not os.path.isdir(fastaDir):
        sys.stderr.write('\nERROR (file not found): you must provide a valid path to the directory containing the species files.\n')
        sys.exit(-2)
    else:
        # make sure it is not empty
        tmpList: List[str] = os.listdir(fastaDir)
        if len(tmpList) < 2:
            sys.stderr.write('\nERROR: the directory containing the species files must contain at least two FASTA files.\n')
            sys.exit(-5)
    # will contain the species names that are actually required
    requSpDict: Dict[str, str] = {}
    # create the list with the required species files
    for clstr, sp2geneDict in clstrDict.items():
        for sp, orthoList in sp2geneDict.items():
            # only process if required
            if sp in requSpDict:
                continue
            else:
                # make sure there is at least one ortholog for the current species
                if len(orthoList) == 1:  # it could be empty
                    if orthoList[0][0] == '*':  # then it is empty
                        continue
                # add the species to the dictionary
                tmpPath: str = os.path.join(fastaDir, sp)
                requSpDict[sp] = tmpPath
                if not os.path.isfile(tmpPath):
                    sys.stderr.write('\nERROR (file not found): the species file for {:s} was not found at\n{:s}\nplease provide a valid path.\n'.format(sp, tmpPath))
                    sys.exit(-2)
    # load all the sequences in a dictionary
    # example: tvaginalis -> geneXAB -> ATGTAGGTA
    seqsDict: Dict[str, Dict[str, str]] = {}
    for spFile, fastaPath in requSpDict.items():
        spName: str = os.path.basename(spFile)
        seqsDict[spName] = load_seqs_in_dict(fastaPath=fastaPath, debug=debug)
    # now generate the output files,
    # organized in one directory per cluster id,
    # with each file named clusterId-species_name
    for clstr, sp2geneDict in clstrDict.items():
        # create the output directory
        tmpClstrDir: str = os.path.join(outDir, 'clstr{:s}/'.format(clstr))
        systools.makedir(tmpClstrDir)
        # now extract the sequences for each species
        if multiFasta:
            # write one FASTA file for each species
            for sp, orthoList in sp2geneDict.items():
                # skip the creation of files if the cluster is empty
                if len(orthoList) == 1:
                    if orthoList[0][0] == '*':
                        continue
                tmpFastaName = 'clstr{:s}-{:s}.fasta'.format(clstr, sp)
                tmpOutPath = os.path.join(tmpClstrDir, tmpFastaName)
                ofd = open(tmpOutPath, 'w')
                # write the sequences
                for ortho in orthoList:
                    if annotate:
                        # create the header by merging the annotations
                        newHdr: str
                        if ortho in annotationDict:  # sometimes no annotation is found!
                            annotLists = annotationDict[ortho]
                            newHdr = '|'.join([';'.join(l) for l in annotLists])
                            ofd.write('>{:s}\n'.format(newHdr))
                        else:
                            ofd.write('>{:s}\n'.format(ortho))
                    else:
                        ofd.write('>{:s}\n'.format(ortho))
                    # write the sequence
                    ofd.write('{:s}\n'.format(str(seqsDict[sp][ortho])))
                ofd.close()
        else:
            # write a single FASTA file
            tmpFastaName = 'clstr{:s}.fasta'.format(clstr)
            tmpOutPath = os.path.join(tmpClstrDir, tmpFastaName)
            ofd = open(tmpOutPath, 'w')
            for sp, orthoList in sp2geneDict.items():
                # skip the cluster if it is empty
                if len(orthoList) == 1:
                    if orthoList[0][0] == '*':
                        continue
                # write the sequences
                for ortho in orthoList:
                    if annotate:
                        # create the header by merging the annotations
                        newHdr: str
                        if ortho in annotationDict:  # sometimes no annotation is found!
                            annotLists = annotationDict[ortho]
                            newHdr = '|'.join([';'.join(l) for l in annotLists])
                            ofd.write('>{:s}\n'.format(newHdr))
                        else:
                            ofd.write('>{:s}\n'.format(ortho))
                    else:
                        ofd.write('>{:s}\n'.format(ortho))
                    # write the sequence
                    ofd.write('{:s}\n'.format(str(seqsDict[sp][ortho])))
            ofd.close()
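
# load_seqs_in_dict() is called above but defined elsewhere. A minimal FASTA
# loader with the same assumed signature is sketched below; header parsing is
# simplified (everything after '>' up to the first whitespace is the gene id).
def _load_seqs_in_dict_sketch(fastaPath: str, debug: bool = False) -> Dict[str, str]:
    seqs: Dict[str, str] = {}
    geneId: str = ''
    for ln in open(fastaPath):
        ln = ln.rstrip('\n')
        if ln.startswith('>'):
            geneId = ln[1:].split(' ', 1)[0]
            seqs[geneId] = ''
        elif geneId:
            seqs[geneId] += ln  # concatenate wrapped sequence lines
    if debug:
        print('Loaded {:d} sequences from {:s}'.format(len(seqs), fastaPath))
    return seqs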
def extract_by_sp_cnt(inTbl: str, min: int = 2, max: int = 2, outDir: str = os.getcwd(), minConf: float = 0.1, debug: bool = False) -> Dict[str, Dict[str, List[str]]]:
    """Extract clusters based on the number of species of which they are composed."""
    if debug:
        print('\nextract_by_sp_cnt :: START')
        print('Input groups table:\t{:s}'.format(inTbl))
        print('Minimum number of species in cluster:\t{:d}'.format(min))
        print('Maximum number of species in cluster:\t{:d}'.format(max))
        print('Output directory: {:s}'.format(outDir))
        print('Minimum confidence for orthologs:\t{:.2f}'.format(minConf))
    # check that the input table is valid
    if not os.path.isfile(inTbl):
        sys.stderr.write('\nERROR (file not found): you must provide a valid path to the text file containing the ortholog groups table generated using SonicParanoid.\n')
        sys.exit(-2)
    # check the minimum confidence value
    if not (0.05 <= minConf <= 1.):
        sys.stderr.write('\nWARNING: the ortholog confidence threshold must be set to a value between 0.05 and 1.0.\n')
        sys.stderr.write('It will now be set to 0.1.\n')
        minConf = 0.1
    # start processing the ortholog groups
    fd = open(inTbl, 'r')
    # extract the header and check the validity of the input file
    hdr_columns: List[str] = fd.readline().rstrip('\n').split('\t')
    # check the hdr
    if not hdr_columns[0] == 'group_id':
        sys.stderr.write('\nERROR: {:s}\nis not a valid header.\n'.format('\t'.join(hdr_columns)))
        sys.exit('Make sure that the ortholog groups file was generated using SonicParanoid.')
    spCntStr: str = str(len(hdr_columns[4:-1]) / 2)
    spCntStr = spCntStr.strip()
    # check that the number of species is valid, i.e., that no column was removed
    # from the file; in that case the division above must give a float ending with '.0'
    if not spCntStr.endswith('.0'):
        sys.stderr.write('\nERROR: there is a problem with the number of species found in the table.\nMake sure you did not manually remove any column from the original output.\n')
        sys.exit(-2)
    # convert the string to int
    spCnt: int = int(spCntStr.split('.', 1)[0])
    # more species requested than those available in the input clusters
    if min > spCnt:
        sys.stderr.write('\nWARNING: {:d} species were found in the input table header, hence clusters with {:d} species cannot exist.\n'.format(spCnt, max))
        sys.stderr.write('Both minimum and maximum will be set to ({:d}).\n'.format(spCnt))
        min = spCnt
        max = spCnt
    # min should be lower than max!
    if min > max:
        sys.stderr.write('\nWARNING: the minimum number of species ({:d}) is higher than the maximum number of species ({:d}).\n'.format(min, max))
        sys.stderr.write('Max will be set to the maximum number of species in the table ({:d}).\n'.format(spCnt))
        max = spCnt
    # extract the species list
    spList: List[str] = []  # will contain the species names
    for i, el in enumerate(hdr_columns[4:-1]):
        if i % 2 == 0:
            spList.append(el)
    # prepare the output file
    if outDir[-1] != '/':
        outDir += '/'
    outPath: str = os.path.join(outDir, 'filtered_min{:d}_max{:d}_{:s}'.format(min, max, os.path.basename(inTbl)))
    # create the output directory if required
    systools.makedir(outDir)
    ofd = open(outPath, 'w')
    # write the header
    ofd.write('{:s}\n'.format('\t'.join(hdr_columns)))
    # output dictionary
    # example: clst105 -> tvaginalis -> [g1, g4, g5]
    outDict: Dict[str, Dict[str, List[str]]] = {}
    extractedClstrCnt: int = 0
    extractedGenesCnt: int = 0
    totCnt: int = 0
    tmpSp: str = ''
    # start looping through the clusters
    for clstr in fd:
        flds: List[str] = clstr.rstrip('\n').split('\t')
        totCnt += 1
        clstrId: str = flds[0]
        spSize: int = int(flds[2])
        # check if the species count falls in the requested range
        if min <= spSize <= max:
            # write the filtered output file
            ofd.write(clstr)
            # keep only the usable fields
            flds = flds[4:-1]
            # add the id to the output dictionary
            outDict[clstrId] = {}
            for i, el in enumerate(flds):
                # extract the cluster
                if i % 2 == 0:
                    # example of cluster:
                    # 2336_Q9X2I8,2336_Q9X172:0.159
                    # create the list for the species
                    tmpSp = spList[int(i / 2)]
                    outDict[clstrId][tmpSp] = []
                    for ortho in el.split(','):
                        tmpFlds: List[str] = ortho.split(':')
                        tmpConf: float
                        # case in which multiple ':' are in the gene name
                        if len(tmpFlds) > 3:
                            if ortho[-1] == ':':  # for example, x1ab:
                                # it is an ortholog for sure
                                outDict[clstrId][tmpSp].append(ortho)
                                extractedGenesCnt += 1
                                continue
                            else:
                                # the final field is the confidence
                                if float(tmpFlds[-1]) >= minConf:
                                    # extract and append the gene name
                                    outDict[clstrId][tmpSp].append(':'.join(tmpFlds[:-1]))
                                    extractedGenesCnt += 1
                                    continue
                        else:
                            # simpler case
                            if len(tmpFlds) == 2:
                                if float(tmpFlds[-1]) >= minConf:
                                    outDict[clstrId][tmpSp].append(tmpFlds[0])
                                    extractedGenesCnt += 1
                            else:
                                # then the confidence must be 1.0
                                outDict[clstrId][tmpSp].append(tmpFlds[0])
                                if tmpFlds[0][0] != '*':
                                    extractedGenesCnt += 1
            # increase the count of extracted clusters
            extractedClstrCnt += 1
    fd.close()
    # close the output file
    ofd.close()
    # print some debug lines
    if debug:
        print('Extracted clusters:\t{:d}'.format(len(outDict)))
        print('Extracted genes:\t{:d}'.format(extractedGenesCnt))
        print('Percentage of extracted clusters:\t{:.2f}'.format(round(float(extractedClstrCnt / totCnt) * 100., 2)))
    # return the main dictionary
    return outDict
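
# A minimal usage sketch: keep only the clusters spanning exactly three species
# and then dump their sequences with extract_fasta(). All the paths below are
# hypothetical placeholders.
def _extract_by_sp_cnt_sketch():
    clstrDict = extract_by_sp_cnt('/tmp/multispecies_clusters.tsv', min=3, max=3,
                                  outDir='/tmp/filtered/', minConf=0.2, debug=True)
    extract_fasta(clstrDict, fastaDir='/tmp/input_proteomes/',
                  outDir='/tmp/filtered_fasta/', multiFasta=False, debug=True)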
def extract_by_id(inTbl: str, idList: List[str] = [], outDir: str = os.getcwd(), minConf: float = 0.1, debug: bool = False) -> Dict[str, Dict[str, List[str]]]:
    """Extract the clusters matching the given list of cluster IDs."""
    if debug:
        print('\nextract_by_id :: START')
        print('Input groups table:\t{:s}'.format(inTbl))
        print('Number of clusters to be extracted:\t{:d}'.format(len(idList)))
        print('IDs to be extracted:\t{:s}'.format(str(idList)))
        print('Output directory: {:s}'.format(outDir))
        print('Minimum confidence for orthologs:\t{:.2f}'.format(minConf))
    # check that the input table is valid
    if not os.path.isfile(inTbl):
        sys.stderr.write('\nERROR (file not found): you must provide a valid path to the text file containing the ortholog groups table generated using SonicParanoid.\n')
        sys.exit(-2)
    # check that at least one id is in the list
    if len(idList) == 0:
        sys.stderr.write('\nERROR: you must provide at least one cluster ID to be extracted, while you have provided none.\n')
        sys.exit(-5)
    # check that there are no repeated IDs in the ID list
    tmpDict: Dict[str, None] = {}
    tmpList: List[str] = []
    for el in idList:
        if not el in tmpDict:
            tmpDict[el] = None
        else:
            tmpList.append(el)
    # remove the repeated IDs if required
    if len(tmpList) > 0:
        for el in tmpList:
            idList.remove(el)
        sys.stderr.write('\nWARNING: the following cluster IDs were repeated in the input ID list and were removed.')
        sys.stderr.write('\n{:s}'.format(str(tmpList)))
        sys.stderr.write('\nThe ID list now contains {:d} cluster IDs.\n\n'.format(len(idList)))
    # remove the tmp structures
    del tmpDict
    tmpList.clear()
    # start processing the ortholog groups
    fd = open(inTbl, 'r')
    # extract the header and check the validity of the input file
    hdr_columns: List[str] = fd.readline().rstrip('\n').split('\t')
    # check the hdr
    if not hdr_columns[0] == 'group_id':
        sys.stderr.write('\nERROR: {:s}\nis not a valid header.\n'.format('\t'.join(hdr_columns)))
        sys.exit('Make sure that the ortholog groups file was generated using SonicParanoid.')
    # extract the species count
    spCntStr: str = str(len(hdr_columns[4:-1]) / 2)
    spCntStr = spCntStr.strip()
    # check that the number of species is valid, i.e., that no column was removed
    # from the file; in that case the division above must give a float ending with '.0'
    if not spCntStr.endswith('.0'):
        sys.stderr.write('\nERROR: there is a problem with the number of species found in the table.\nMake sure you did not manually remove any column from the original output.\n')
        sys.exit(-2)
    # convert the string to int
    spCnt: int = int(spCntStr.split('.', 1)[0])
    # extract the species list
    spList: List[str] = []  # will contain the species names
    for i, el in enumerate(hdr_columns[4:-1]):
        if i % 2 == 0:
            spList.append(el)
    # prepare the output file
    outPath: str = os.path.join(outDir, 'filtered_{:s}'.format(os.path.basename(inTbl)))
    # create the output directory if required
    systools.makedir(outDir)
    ofd = open(outPath, 'w')
    # write the header
    ofd.write('{:s}\n'.format('\t'.join(hdr_columns)))
    # output dictionary and other variables
    # example: clst105 -> tvaginalis -> [g1, g4, g5]
    outDict: Dict[str, Dict[str, List[str]]] = {}
    extractedClstrCnt: int = 0
    extractedGenesCnt: int = 0
    totCnt: int = 0
    tmpSp: str = ''
    # start looping through the clusters
    for clstr in fd:
        flds: List[str] = clstr.rstrip('\n').split('\t')
        totCnt += 1
        clstrId: str = flds[0]
        # extract the information from the cluster
        if clstrId in idList:
            # write the filtered output file
            ofd.write(clstr)
            # keep only the usable fields
            flds = flds[4:-1]
            # add the id to the output dictionary
            outDict[clstrId] = {}
            for i, el in enumerate(flds):
                # extract the cluster
                if i % 2 == 0:
                    # example of cluster:
                    # 2336_Q9X2I8,2336_Q9X172:0.159
                    # create the list for the species
                    tmpSp = spList[int(i / 2)]
                    outDict[clstrId][tmpSp] = []
                    for ortho in el.split(','):
                        tmpFlds: List[str] = ortho.split(':')
                        tmpConf: float
                        # case in which multiple ':' are in the gene name
                        if len(tmpFlds) > 3:
                            if ortho[-1] == ':':  # for example, x1ab:
                                # it is an ortholog for sure
                                outDict[clstrId][tmpSp].append(ortho)
                                if outDict[clstrId][tmpSp][-1][0] != '*':
                                    extractedGenesCnt += 1
                                continue
                            else:
                                # the final field is the confidence
                                if float(tmpFlds[-1]) >= minConf:
                                    # extract and append the gene name
                                    outDict[clstrId][tmpSp].append(':'.join(tmpFlds[:-1]))
                                    if outDict[clstrId][tmpSp][-1][0] != '*':
                                        extractedGenesCnt += 1
                                    continue
                        else:
                            # simpler case
                            if len(tmpFlds) == 2:
                                if float(tmpFlds[-1]) >= minConf:
                                    outDict[clstrId][tmpSp].append(tmpFlds[0])
                                    extractedGenesCnt += 1
                            else:
                                # then the confidence must be 1.0
                                outDict[clstrId][tmpSp].append(tmpFlds[0])
                                if tmpFlds[0][0] != '*':
                                    extractedGenesCnt += 1
            # remove the ID from the list
            idList.remove(clstrId)
            # increase the count of extracted clusters
            extractedClstrCnt += 1
    fd.close()
    # close the output file
    ofd.close()
    # print some debug lines
    if debug:
        print('Extracted clusters:\t{:d}'.format(len(outDict)))
        if len(idList) > 0:
            print('(WARNING) The following clusters were not found: {:s}'.format(str(idList)))
        print('Extracted genes:\t{:d}'.format(extractedGenesCnt))
        print('Percentage of extracted clusters:\t{:.2f}'.format(round(float(extractedClstrCnt / totCnt) * 100., 2)))
    # return the main dictionary
    return outDict
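
# A minimal usage sketch for extract_by_id(); the cluster IDs and paths below
# are hypothetical placeholders. Note that the function consumes IDs from
# idList as they are found, so any IDs left over were not present in the table.
def _extract_by_id_sketch():
    clstrDict = extract_by_id('/tmp/multispecies_clusters.tsv',
                              idList=['1', '105', '777'],
                              outDir='/tmp/filtered_by_id/', minConf=0.1, debug=True)
    print('Clusters found: {:d}'.format(len(clstrDict)))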
def perform_parallel_orthology_inference(requiredPairsDict, inDir, outDir=os.getcwd(), sharedDir=None, cutoff=40, confCutoff=0.05, lenDiffThr=0.5, threads=8, debug=False):
    """Execute orthology inference for the required pairs."""
    if debug:
        print('\nperform_parallel_orthology_inference :: START')
        print('Proteome pairs to be processed:\t{:d}'.format(len(requiredPairsDict)))
        print('Input directory:{:s}'.format(inDir))
        print('Outdir:{:s}'.format(outDir))
        print('Alignment directory:{:s}'.format(sharedDir))
        print('Cutoff:\t{:d}'.format(cutoff))
        print('Confidence cutoff for paralogs:\t{:s}'.format(str(confCutoff)))
        print('Length difference filtering threshold:\t{:s}'.format(str(lenDiffThr)))
        print('CPUs (for mmseqs):\t{:d}'.format(threads))
    # make sure that the directory with the alignments exists
    if not os.path.isdir(sharedDir):
        sys.stderr.write('ERROR: the directory with the alignment files\n{:s}\nwas not found, please provide a valid path\n'.format(sharedDir))
        sys.exit(-2)
    # make sure that the directory with the input files exists
    if not os.path.isdir(inDir):
        sys.stderr.write('ERROR: the directory with the input files\n{:s}\nwas not found, please provide a valid path\n'.format(inDir))
        sys.exit(-2)
    # create the output directory if it does not exist yet
    if outDir != os.getcwd():
        if not os.path.isdir(outDir):
            systools.makedir(outDir)
    if outDir[-1] != '/':
        outDir += '/'
    # check that the output directory differs from the input one
    if os.path.dirname(inDir) == os.path.dirname(outDir):
        sys.stderr.write('\nERROR: the output directory {:s}\nmust be different from the one in which the input files are stored.\n'.format(outDir))
        sys.exit(-2)
    # check the cutoff
    if cutoff < 30:
        cutoff = 40
    # create the queue and start adding the jobs
    jobs_queue = mp.Queue()
    # fill the queue with the pairs to be processed
    for pair in requiredPairsDict:
        jobs_queue.put(pair)
    # add a sentinel (None) for each worker so it knows when to stop
    for i in range(0, threads):
        jobs_queue.put(None)
    #sys.exit('DEBUG :: 3')
    # queue to contain the execution times
    results_queue = mp.Queue(maxsize=len(requiredPairsDict))
    # call the method inside the workers
    runningJobs = [mp.Process(target=consume_orthology_inference_jobs, args=(jobs_queue, results_queue, inDir, outDir, sharedDir, cutoff, confCutoff, lenDiffThr, threads, debug)) for i_ in range(threads)]
    for proc in runningJobs:
        #print('Start job\t{}'.format(proc))
        proc.start()
    # open the file in which the time information will be stored
    execTimeOutPath = os.path.join(sharedDir, 'orthology_ex_time_{:s}.tsv'.format(os.path.basename(outDir.rstrip('/'))))
    ofd = open(execTimeOutPath, 'w', buffering=1)
    # get the results from the queue without filling the pipe buffer
    while True:
        try:
            p, val = results_queue.get(False, 0.01)
            ofd.write('{:s}\t{:s}\n'.format(p, str(val)))
        except queue.Empty:
            pass
        allExited = True
        for t in runningJobs:
            if t.exitcode is None:
                allExited = False
                break
        if allExited and results_queue.empty():
            break
    ofd.close()
    for proc in runningJobs:
        while proc.is_alive():
            proc.join()
def mmseqs_search(inSeq, dbSeq, dbDir=os.getcwd(), outDir=os.getcwd(), tmpDirName=None, sensitivity=4.0, evalue=1000, threads=4, cleanUp=False, debug=False):
    """Align protein sequences using mmseqs2."""
    if debug:
        print('\nmmseqs_search :: START')
        print('Input query FASTA file:\t%s' % inSeq)
        print('Input target FASTA file:\t%s' % dbSeq)
        print('mmseqs2 database directory:\t%s' % dbDir)
        print('Output directory:\t%s' % outDir)
        print('MMseqs2 tmp directory:\t{:s}'.format(str(tmpDirName)))
        print('MMseqs2 sensitivity (-s):\t%s' % str(sensitivity))
        print('Threads:\t%d' % threads)
        print('Remove temporary files:\t%s' % cleanUp)
    # check that the input files exist
    if not os.path.isfile(inSeq):
        sys.stderr.write('The query file %s was not found, please provide the path to a valid FASTA file.\n' % inSeq)
        sys.exit(-2)
    if not os.path.isfile(dbSeq):
        sys.stderr.write('The target file %s was not found, please provide the path to a valid FASTA file.\n' % dbSeq)
        sys.exit(-2)
    # check the sensitivity
    if (sensitivity < 1) or (sensitivity > 8.5):
        sys.stderr.write('\nERROR: the sensitivity value for MMseqs2.0 must be a value between 1.0 and 8.5.\n')
        sys.exit(-5)
    # create the directories if not previously created
    systools.makedir(outDir)
    systools.makedir(dbDir)
    # set the tmp dir
    tmpDir = None
    if tmpDirName is None:
        tmpDir = '%stmp_mmseqs/' % outDir
    else:
        tmpDir = '{:s}{:s}/'.format(outDir, tmpDirName)
    systools.makedir(tmpDir)
    # set the query db name
    queryDBname = os.path.basename(inSeq)
    queryDBname = queryDBname.split('.')[0]  # take the left part of the file name
    queryDBname = '%s.mmseqs2db' % queryDBname
    queryDBpath = '%s%s' % (dbDir, queryDBname)
    # create the database if it does not exist yet
    if not os.path.isfile(queryDBpath):
        mmseqs_createdb(inSeq, outDir=dbDir, debug=debug)
        mmseqs_createindex(queryDBpath, debug=debug)
    # set the target db name
    targetDBname = os.path.basename(dbSeq)
    targetDBname = targetDBname.split('.')[0]  # take the left part of the file name
    targetDBname = '%s.mmseqs2db' % targetDBname
    targetDBpath = '%s%s' % (dbDir, targetDBname)
    # create the database if it does not exist yet
    if not os.path.isfile(targetDBpath):
        mmseqs_createdb(dbSeq, outDir=dbDir, debug=debug)
        mmseqs_createindex(targetDBpath, debug=debug)
    # set the output names
    pairName = '%s-%s' % (os.path.basename(inSeq), os.path.basename(dbSeq))
    rawOutName = 'mmseqs2raw.%s' % pairName
    rawOutPath = '%s%s' % (outDir, rawOutName)
    blastOutName = 'mmseqs2blast.%s' % pairName
    blastOutPath = '%s%s' % (outDir, blastOutName)
    # start measuring the execution time
    # use perf_counter (includes time spent during sleep)
    start_time = time.perf_counter()
    # command to be executed
    minUngappedScore = 15
    # EXAMPLE: mmseqs search queryDBfile targetDBfile outputFile tmpDir -s 7.5 -e 100000 --threads threads
    searchCmd = '{:s} search {:s} {:s} {:s} {:s} -s {:s} --threads {:d} -v 0 --min-ungapped-score {:d} --alignment-mode 2 --alt-ali 10'.format(get_mmseqs_path(), queryDBpath, targetDBpath, rawOutPath, tmpDir, str(sensitivity), threads, minUngappedScore)
    if debug:
        print('mmseqs2 search CMD:\t%s' % searchCmd)
    # use run (or call)
    subprocess.run(searchCmd, env=my_env, shell=True)
    # output an error if the alignment did not finish correctly
    if not os.path.isfile(rawOutPath):
        sys.stderr.write('\nERROR: the MMseqs2 raw alignment file was not generated.\n')
        sys.exit(-2)
    # stop the counter
    # use perf_counter (includes time spent during sleep)
    end_search = time.perf_counter()
    # use process_time (user + system CPU time, no sleep time)
    #end_search = time.process_time()
    search_time = round(end_search - start_time, 2)
    # convert the output to tab-separated BLAST output
    # EXAMPLE: mmseqs convertalis query.db target.db query_target_rawout query_target_blastout
    convertCmd = '%s convertalis %s %s %s %s -v 0 --format-mode 0' % (get_mmseqs_path(), queryDBpath, targetDBpath, rawOutPath, blastOutPath)
    # perform the file conversion
    subprocess.run(convertCmd, env=my_env, shell=True)
    if debug:
        print('mmseqs2 convertalis CMD:\t%s' % convertCmd)
    # conversion exec time
    #convert_time = round(time.time() - end_search, 2)
    # use perf_counter (includes time spent during sleep)
    convert_time = round(time.perf_counter() - end_search, 2)
    # use process_time (user + system CPU time, no sleep time)
    #convert_time = round(time.process_time() - end_search, 2)
    # clean up the output directory
    if cleanUp:
        mmseqs_cleanup(inDir=outDir, debug=debug)
    # output an error if the alignment could not be converted
    if not os.path.isfile(blastOutPath):
        sys.stderr.write('\nERROR: the MMseqs2 raw alignment could not be converted into the BLAST alignment format.\n')
        sys.exit(-2)
    return (blastOutPath, search_time, convert_time)
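
# A minimal usage sketch: run a one-vs-one search and report the timings.
# All the paths below are hypothetical placeholders.
def _mmseqs_search_sketch():
    blastOut, searchTime, convertTime = mmseqs_search(
        '/tmp/proteome_a.fasta', '/tmp/proteome_b.fasta',
        dbDir='/tmp/mmseqs2_databases/', outDir='/tmp/alignments/',
        sensitivity=4.0, threads=4, cleanUp=True, debug=True)
    print('{:s}\t(search: {:.2f}s, convert: {:.2f}s)'.format(blastOut, searchTime, convertTime))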
def consume_orthology_inference_jobs(jobs_queue, results_queue, inDir, outDir=os.getcwd(), sharedDir=None, cutoff=40, confCutoff=0.05, lenDiffThr=0.5, threads=8, debug=False):
    """Perform orthology inference in parallel."""
    while True:
        current_pair = jobs_queue.get(True, 1)
        if current_pair is None:
            break
        # prepare the run and create the output directory if needed
        sp1, sp2 = current_pair.split('-', 1)
        runDir = os.path.join(outDir, current_pair)
        systools.makedir(runDir)
        inSp1 = os.path.join(inDir, sp1)
        inSp2 = os.path.join(inDir, sp2)
        # check that the input files do exist
        if not os.path.isfile(inSp1):
            sys.stderr.write('ERROR: the input file for {:s} was not found, please provide a valid path.\n'.format(sp1))
        if not os.path.isfile(inSp2):
            sys.stderr.write('ERROR: the input file for {:s} was not found, please provide a valid path.\n'.format(sp2))
        # prepare the names of the required alignments
        # check AA
        AA = '{:s}-{:s}'.format(sp1, sp1)
        shPathAA = os.path.join(sharedDir, AA)
        if not os.path.isfile(shPathAA):
            sys.stderr.write('ERROR: the alignment file for {:s} was not found, please generate the alignments first.\n'.format(AA))
        # check BB
        BB = '{:s}-{:s}'.format(sp2, sp2)
        shPathBB = os.path.join(sharedDir, BB)
        if not os.path.isfile(shPathBB):
            sys.stderr.write('ERROR: the alignment file for {:s} was not found, please generate the alignments first.\n'.format(BB))
        # check AB
        AB = '{:s}-{:s}'.format(sp1, sp2)
        shPathAB = os.path.join(sharedDir, AB)
        if not os.path.isfile(shPathAB):
            sys.stderr.write('ERROR: the alignment file for {:s} was not found, please generate the alignments first.\n'.format(AB))
        # check BA
        BA = '{:s}-{:s}'.format(sp2, sp1)
        shPathBA = os.path.join(sharedDir, BA)
        if not os.path.isfile(shPathBA):
            sys.stderr.write('ERROR: the alignment file for {:s} was not found, please generate the alignments first.\n'.format(BA))
        #sys.exit('DEBUG :: workers :: consume_orthology_inference_jobs :: after files copy')
        # prepare the paths for the output tables
        outTable = os.path.join(runDir, 'table.{:s}'.format(current_pair))
        outSql = os.path.join(runDir, 'sqltable.{:s}'.format(current_pair))
        # infer the orthologs
        # use perf_counter (includes time spent during sleep)
        orthology_prediction_start = time.perf_counter()
        inpyranoid.infer_orthologs(inSp1, inSp2, alignDir=sharedDir, outDir=runDir, confCutoff=confCutoff, lenDiffThr=lenDiffThr, debug=False)
        #sys.exit('DEBUG :: workers :: consume_orthology_inference_jobs :: after orthology')
        # check that all the files have been created
        if not os.path.isfile(outTable):
            sys.stderr.write('WARNING: the ortholog table file %s was not generated.' % outTable)
            outTable = None
        if not os.path.isfile(outSql):
            sys.stderr.write('WARNING: the SQL table %s was not generated.' % outSql)
            outSql = None
        # everything went ok!
        # use perf_counter (includes time spent during sleep)
        end_time = time.perf_counter()
        orthology_prediction_tot = round(end_time - orthology_prediction_start, 2)
        #sys.exit('DEBUG :: workers :: consume_orthology_inference_jobs :: orthology done!')
        # add the execution time to the results queue
        results_queue.put((current_pair, str(orthology_prediction_tot)))
        if debug:
            sys.stdout.write('\nOrthology prediction {:s} (seconds):\t{:s}\n'.format(current_pair, str(orthology_prediction_tot)))
def filter_ortholog_table(abTbl, a, b, outDir=os.getcwd(), lenThr=0.25, debug=False):
    '''Filter an ortholog table based on sequence lengths.'''
    if debug:
        print('filter_ortholog_table :: START')
        print('Ortholog table: %s' % abTbl)
        print('Proteome A: %s' % a)
        print('Proteome B: %s' % b)
        print('Output directory: %s' % outDir)
        print('Length difference threshold: %s' % str(lenThr))
    # load the sequence lengths for A and B
    lenDictA = load_seq_lengths(a, debug)
    lenDictB = load_seq_lengths(b, debug)
    # load the information from the clusters
    # EXAMPLE
    #OrtoId	Score	OrtoA	OrtoB
    #1	1163	1423_Q9KWU4 1.0	9606_P11498 1.0
    #2	963	1423_P09339 1.0	9606_P21399 1.0 9606_P48200 0.201
    # these will contain the info for the paralogs of each of the 2 species
    if outDir[-1] != '/':
        outDir = '%s/' % outDir
    # check that the output directory is different from the one of the input file
    inDir = '%s/' % os.path.dirname(abTbl)
    if inDir == outDir:
        sys.exit('The output directory must be different from that of the input table.')
    # create the output directory if needed
    systools.makedir(outDir)
    # new table path
    abTblNew = '%s%s' % (outDir, os.path.basename(abTbl))
    # rejected list file path
    rjctTbl = '%s%s' % (outDir, os.path.basename(abTbl.replace('table.', 'rejected.')))
    # open the output files
    ofdNewTbl = open(abTblNew, 'w')
    ofdRjct = open(rjctTbl, 'w')
    # count the read and written genes
    orthoRdCntA = inparaRdCntA = orthoRdCntB = inparaRdCntB = 0
    orthoWrtCntA = inparaWrtCntA = orthoWrtCntB = inparaWrtCntB = 0
    for ln in open(abTbl):
        if ln[0] == 'O':
            ofdNewTbl.write(ln)
            continue
        ln = ln.rstrip('\n')
        clstrId, score, paraA, paraB = ln.split('\t')
        # extract the orthologs and inparalogs from A
        orthoListA, inparaDictRawA = extract_paralogs(paraA, debug=debug)
        orthoRdCntA += len(orthoListA)
        inparaRdCntA += len(inparaDictRawA)
        # if there are inparalogs then check if they should be kept or not
        keptInpaListA = []
        droppedInpaDictA = OrderedDict()
        if len(inparaDictRawA):
            #print('ClstrID:\t%s' % clstrId)
            # set the ortholog lengths for A
            lenListOrthoA = []
            if len(orthoListA) > 1:
                lenListOrthoA = calc_ortholog_leghths(orthoListA, lenDictA, debug=debug)
            elif len(orthoListA) == 0:
                sys.exit('ERROR: at least one ortholog must be found!')
            else:
                # add the only available length
                lenListOrthoA.append(lenDictA[orthoListA[0]])
            # filter the inparalogs from A
            droppedInpaDictA, keptInpaListA = filter_inparalogs(inparaDictRawA, lenDictA, orthoLenList=lenListOrthoA, lenRatioThr=lenThr, debug=debug)
        # extract the orthologs and inparalogs from B
        orthoListB, inparaDictRawB = extract_paralogs(paraB, debug=debug)
        orthoRdCntB += len(orthoListB)
        inparaRdCntB += len(inparaDictRawB)
        # if there are inparalogs then check if they should be kept or not
        keptInpaListB = []
        droppedInpaDictB = OrderedDict()
        if len(inparaDictRawB):
            # set the ortholog lengths for B
            lenListOrthoB = []
            if len(orthoListB) > 1:
                lenListOrthoB = calc_ortholog_leghths(orthoListB, lenDictB, debug=debug)
            elif len(orthoListB) == 0:
                sys.exit('ERROR: at least one ortholog must be found!')
            else:
                # add the only available length
                lenListOrthoB.append(lenDictB[orthoListB[0]])
            # filter the inparalogs from B
            droppedInpaDictB, keptInpaListB = filter_inparalogs(inparaDictRawB, lenDictB, orthoLenList=lenListOrthoB, lenRatioThr=lenThr, debug=debug)
        # START WRITING THE NEW TABLE
        ofdNewTbl.write('%s\t%s\t' % (clstrId, score))
        # write the output cluster for A
        # write the core orthologs for A
        tmpLnList = []
        for orthoTmpGene in orthoListA:
            tmpLnList.append('%s 1.0' % (orthoTmpGene))
        ofdNewTbl.write(' '.join(tmpLnList))
        orthoWrtCntA += len(tmpLnList)
        # write the rejected inparalogs
        for k in droppedInpaDictA:
            tmpLenDiff, tmpConf, inparaVsOrthoRatio = droppedInpaDictA[k]
            ofdRjct.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (clstrId, score, k, tmpConf, tmpLenDiff, inparaVsOrthoRatio))
        # reset the tmp list
        tmpLnList.clear()
        # write the valid inparalogs to the cluster
        for tmpInparaA in keptInpaListA:
            tmpLnList.append('%s %s' % (tmpInparaA, inparaDictRawA[tmpInparaA]))
        if len(tmpLnList) > 0:
            ofdNewTbl.write(' %s' % (' '.join(tmpLnList)))
            inparaWrtCntA += len(tmpLnList)
        tmpLnList.clear()
        # now start writing the right part of the cluster
        ofdNewTbl.write('\t')
        # write the core orthologs for B
        tmpLnList.clear()
        for orthoTmpGene in orthoListB:
            tmpLnList.append('%s 1.0' % (orthoTmpGene))
        ofdNewTbl.write(' '.join(tmpLnList))
        orthoWrtCntB += len(tmpLnList)
        # write the rejected inparalogs
        for k in droppedInpaDictB:
            tmpLenDiff, tmpConf, inparaVsOrthoRatio = droppedInpaDictB[k]
            ofdRjct.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (clstrId, score, k, tmpConf, tmpLenDiff, inparaVsOrthoRatio))
        # reset the tmp list
        tmpLnList.clear()
        # write the valid inparalogs to the cluster
        for tmpInparaB in keptInpaListB:
            tmpLnList.append('%s %s' % (tmpInparaB, inparaDictRawB[tmpInparaB]))
        if len(tmpLnList) > 0:
            ofdNewTbl.write(' %s' % (' '.join(tmpLnList)))
            inparaWrtCntB += len(tmpLnList)
        tmpLnList.clear()
        # close the cluster line
        ofdNewTbl.write('\n')
    # close the output files
    ofdNewTbl.close()
    ofdRjct.close()
    if debug:
        print('\nRead orthologs A/B; inparalogs A/B:')
        print('%d\t%d\t%d\t%d' % (orthoRdCntA, orthoRdCntB, inparaRdCntA, inparaRdCntB))
        print('\nWritten orthologs A/B; inparalogs A/B:')
        print('%d\t%d\t%d\t%d' % (orthoWrtCntA, orthoWrtCntB, inparaWrtCntA, inparaWrtCntB))
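
# load_seq_lengths() is called above but defined elsewhere. A minimal FASTA
# length loader with the same assumed signature is sketched below; it counts
# the residues across wrapped sequence lines for each header.
def _load_seq_lengths_sketch(fastaPath, debug=False):
    lenDict = {}
    geneId = ''
    for ln in open(fastaPath):
        ln = ln.rstrip('\n')
        if ln.startswith('>'):
            geneId = ln[1:].split(' ', 1)[0]
            lenDict[geneId] = 0
        elif geneId:
            lenDict[geneId] += len(ln)  # accumulate wrapped sequence lines
    if debug:
        print('Loaded lengths for {:d} sequences from {:s}'.format(len(lenDict), fastaPath))
    return lenDict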