Example #1
0
def GetIDsDict(orthofinderWorkingDir):
    """Map each sequence ID to "<species name>_<sequence name>".

    Reads "SequenceIDs.txt" and "SpeciesIDs.txt" from orthofinderWorkingDir.
    Sequence names are first taken from util.FirstWordExtractor; if that
    raises a RuntimeError whose text does not start with "ERROR", the full
    accession line is used instead (util.FullAccession).

    Args:
        orthofinderWorkingDir: Directory containing the two ID files. A
            trailing path separator is accepted but no longer required.

    Returns:
        dict: sequence ID -> species name + "_" + sequence name. The species
        is looked up from the part of the sequence ID before the first "_";
        the species name is the base name of the file listed in
        SpeciesIDs.txt with its extension removed.

    Notes:
        util.Fail() is called when duplicate IDs are reported (presumably it
        terminates the program - confirm in util).
    """
    duplicateIDMessage = "ERROR: %s contains a duplicate ID. If %s was prepared manually then please check the IDs are correct. "

    # sequence IDs
    # os.path.join gives the same path as plain concatenation when the
    # directory already ends with a separator, and a correct one when it doesn't.
    idsFilename = os.path.join(orthofinderWorkingDir, "SequenceIDs.txt")
    try:
        idExtract = util.FirstWordExtractor(idsFilename)
        idDict = idExtract.GetIDToNameDict()
    except RuntimeError as error:
        # str(error) works on Python 2 and 3; the .message attribute was
        # removed from exceptions in Python 3.
        message = str(error)
        print(message)
        if message.startswith("ERROR"):
            print(duplicateIDMessage % (idsFilename, idsFilename))
            util.Fail()
        else:
            print("Tried to use only the first part of the accession in order to list the sequences in each orthogroup more concisely but these were not unique. Will use the full accession line instead.")
            try:
                idExtract = util.FullAccession(idsFilename)
                idDict = idExtract.GetIDToNameDict()
            except Exception:
                # Not a bare except: Ctrl-C and SystemExit must still propagate.
                print(duplicateIDMessage % (idsFilename, idsFilename))
                util.Fail()

    # species names
    speciesDict = dict()
    # Text mode so each line is a str on Python 3 as well (binary mode yields
    # bytes there, which cannot be split with a str separator).
    with open(os.path.join(orthofinderWorkingDir, "SpeciesIDs.txt"), 'r') as idsFile:
        for line in idsFile:
            line = line.rstrip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            iSpecies, filename = line.split(": ", 1)
            # Any "#" characters in the species index are dropped so the
            # lookup key is the bare index.
            iSpecies = iSpecies.replace("#", "")
            speciesName = os.path.splitext(os.path.split(filename)[1])[0]
            speciesDict[iSpecies] = speciesName
    idDict = {seqID: speciesDict[seqID.split("_")[0]] + "_" + name for seqID, name in idDict.items()}
    return idDict
def GetSpeciesSequenceIDsDict(sequenceIDsFilename, speciesIDsFN=None):
    """Return a dict mapping sequence IDs to sequence names.

    Names are first taken from util.FirstWordExtractor; if that raises a
    RuntimeError whose text does not start with "ERROR", the full accession
    line is used instead (util.FullAccession).

    Args:
        sequenceIDsFilename: Path of the sequence IDs file.
        speciesIDsFN: Optional path of the species IDs file. When given, each
            name is prefixed with "<species name>_", where the species is
            looked up from the part of the sequence ID before the first "_".

    Returns:
        dict: sequence ID -> name (species-prefixed if speciesIDsFN is given).

    Notes:
        util.Fail() is called when the extractor reports an "ERROR"
        (presumably it terminates the program - confirm in util).
    """
    try:
        extract = util.FirstWordExtractor(sequenceIDsFilename)
    except RuntimeError as error:
        # str(error) works on Python 2 and 3; the .message attribute was
        # removed from exceptions in Python 3.
        message = str(error)
        print(message)
        if message.startswith("ERROR"):
            util.Fail()
        else:
            print(
                "Tried to use only the first part of the accession in order to list the sequences in each orthogroup\nmore concisely but these were not unique. The full accession line will be used instead.\n"
            )
            extract = util.FullAccession(sequenceIDsFilename)
    idsDict = extract.GetIDToNameDict()
    if speciesIDsFN is not None:
        speciesDict = util.FullAccession(speciesIDsFN).GetIDToNameDict()
        # Species name = file name minus its last extension, with remaining
        # dots and spaces replaced by underscores.
        speciesDict = {
            k: v.rsplit(".", 1)[0].replace(".", "_").replace(" ", "_")
            for k, v in speciesDict.items()
        }
        idsDict = {
            seqID: speciesDict[seqID.split("_")[0]] + "_" + name
            for seqID, name in idsDict.items()
        }
    return idsDict
Example #3
0
    For details please see the License.md that came with this software.\n""")
    if len(sys.argv) == 1 or sys.argv[1] == "--help" or sys.argv[1] == "help" or sys.argv[1] == "-h":
        PrintHelp()
        sys.exit()
        
    # Get arguments    
    userDir = None
    nProcesses = None
    
    args = sys.argv[1:]    
    while len(args) != 0:
        arg = args.pop(0)
        if arg == "-t" or arg == "--threads":
            if len(args) == 0:
                print("Missing option for command line argument -t")
                util.Fail()
            arg = args.pop(0)
            try:
                nProcesses = int(arg)
            except:
                print("Incorrect argument for number of threads: %s" % arg)
                util.Fail()   
        else:
            userDir = arg
    
    # Check arguments
    orthofinderWorkingDir, orthofinderResultsDir, clustersFilename_pairs = util.GetOGsFile(userDir)

    if nProcesses == None:
        print("""Number of parallel processes has not been specified, will use the default value.  
   Number of parallel processes can be specified using the -t option\n""")