def GetIDsDict(orthofinderWorkingDir):
    """Return a dict mapping each sequence ID to "<species name>_<sequence name>".

    Reads "SequenceIDs.txt" and "SpeciesIDs.txt" from orthofinderWorkingDir.
    The directory string is concatenated directly with the file names, so it
    must already end with a path separator.

    Sequence names are first extracted with util.FirstWordExtractor. If that
    raises a RuntimeError whose message does not start with "ERROR", the
    extraction is retried with util.FullAccession. In the remaining failure
    cases a duplicate-ID message is printed and util.Fail() is called
    (presumably terminating the program - confirm against util).

    The species prefix is looked up using the part of the sequence ID before
    the first "_"; the species name is the base name of the file listed in
    SpeciesIDs.txt with its extension removed.
    """
    # sequence IDs
    idsFilename = orthofinderWorkingDir + "SequenceIDs.txt"
    duplicateIDMessage = "ERROR: %s contains a duplicate ID. If %s was prepared manually then please check the IDs are correct. " % (idsFilename, idsFilename)
    try:
        idDict = util.FirstWordExtractor(idsFilename).GetIDToNameDict()
    except RuntimeError as error:
        # Exceptions have no .message attribute on Python 3; keep using it
        # when present (Python 2) and fall back to str(error) otherwise.
        message = getattr(error, "message", str(error))
        print(message)
        if message.startswith("ERROR"):
            print(duplicateIDMessage)
            util.Fail()
        else:
            print("Tried to use only the first part of the accession in order to list the sequences in each orthogroup more concisely but these were not unique. Will use the full accession line instead.")
            try:
                idDict = util.FullAccession(idsFilename).GetIDToNameDict()
            except Exception:
                # Exception rather than a bare except, so SystemExit and
                # KeyboardInterrupt are not swallowed and re-reported here.
                print(duplicateIDMessage)
                util.Fail()

    # species names
    speciesDict = dict()
    # Text mode: each line must be a str so that it can be split on ": "
    # (binary mode yields bytes on Python 3 and the split would raise).
    with open(orthofinderWorkingDir + "SpeciesIDs.txt", 'r') as idsFile:
        for line in idsFile:
            line = line.rstrip()
            if not line:
                # Tolerate blank lines instead of failing on the unpack below.
                continue
            iSpecies, filename = line.split(": ", 1)
            # Remove any "#" from the species index (presumably so that
            # commented-out species entries are still mapped - confirm).
            iSpecies = iSpecies.replace("#", "")
            speciesName = os.path.splitext(os.path.split(filename)[1])[0]
            speciesDict[iSpecies] = speciesName

    # Prefix every sequence name with the name of the species it belongs to.
    idDict = {seqID: speciesDict[seqID.split("_")[0]] + "_" + name for seqID, name in idDict.items()}
    return idDict
def GetSpeciesSequenceIDsDict(sequenceIDsFilename, speciesIDsFN=None):
    """Return a dict mapping sequence IDs to sequence names.

    Names are extracted from sequenceIDsFilename with util.FirstWordExtractor.
    If that raises a RuntimeError whose message does not start with "ERROR",
    util.FullAccession is used instead; if the message does start with
    "ERROR", util.Fail() is called (presumably terminating the program -
    confirm against util).

    When speciesIDsFN is given, each name is prefixed with "<species>_", where
    the species is looked up using the part of the sequence ID before the
    first "_". The species name is the entry from speciesIDsFN with the text
    after its last "." dropped and any remaining "." or " " replaced by "_".
    """
    try:
        extract = util.FirstWordExtractor(sequenceIDsFilename)
    except RuntimeError as error:
        # Exceptions have no .message attribute on Python 3; keep using it
        # when present (Python 2) and fall back to str(error) otherwise.
        message = getattr(error, "message", str(error))
        print(message)
        if message.startswith("ERROR"):
            util.Fail()
        else:
            print(
                "Tried to use only the first part of the accession in order to list the sequences in each orthogroup\n"
                "more concisely but these were not unique. The full accession line will be used instead.\n"
            )
            extract = util.FullAccession(sequenceIDsFilename)
    idsDict = extract.GetIDToNameDict()

    if speciesIDsFN is not None:
        rawSpeciesDict = util.FullAccession(speciesIDsFN).GetIDToNameDict()
        # Drop the text after the last "." and make the rest safe to use as a
        # prefix by replacing the remaining dots and spaces with underscores.
        speciesDict = {
            k: v.rsplit(".", 1)[0].replace(".", "_").replace(" ", "_")
            for k, v in rawSpeciesDict.items()
        }
        idsDict = {
            seqID: speciesDict[seqID.split("_")[0]] + "_" + name
            for seqID, name in idsDict.items()
        }
    return idsDict
For details please see the License.md that came with this software.\n""") if len(sys.argv) == 1 or sys.argv[1] == "--help" or sys.argv[1] == "help" or sys.argv[1] == "-h": PrintHelp() sys.exit() # Get arguments userDir = None nProcesses = None args = sys.argv[1:] while len(args) != 0: arg = args.pop(0) if arg == "-t" or arg == "--threads": if len(args) == 0: print("Missing option for command line argument -t") util.Fail() arg = args.pop(0) try: nProcesses = int(arg) except: print("Incorrect argument for number of threads: %s" % arg) util.Fail() else: userDir = arg # Check arguments orthofinderWorkingDir, orthofinderResultsDir, clustersFilename_pairs = util.GetOGsFile(userDir) if nProcesses == None: print("""Number of parallel processes has not been specified, will use the default value. Number of parallel processes can be specified using the -t option\n""")