def main(inputFile, maxPercentage, minLength, maxLength, SEG, cullOperationID='CullResults'):
    """Runs the protein culling.

    The input FASTA file is passed through checkfastaformat.main, the text it returns is BLASTed
    against itself with performBLAST.main, the similarities are turned into a graph by
    adjlistcreation.user_seq_main and the proteins to remove are chosen by Leafcull.main.

    The output directory is created next to this source file. Anything already present at that
    location is deleted first. The following files are written into it:
        InputCopy.fasta - the text returned by checkfastaformat.main.
        Removed.txt     - one culled protein ID per line.
        KeptFasta.fasta - the FASTA records of the proteins that were kept.
        KeptList.txt    - a tab separated table of the kept protein IDs and their sequence lengths.

    sys.exit() is called when the arguments are invalid or when checkfastaformat.main reports an error.

    @param inputFile: The location of the input FASTA file.
    @type inputFile: A Python string.
    @param maxPercentage: The maximum allowable percentage sequence similarity (5 - 100).
    @type maxPercentage: A number.
    @param minLength: Accepted for interface compatibility; not used by this function.
    @param maxLength: Accepted for interface compatibility; not used by this function.
    @param SEG: Passed unchanged to performBLAST.main (presumably whether SEG is run - verify there).
    @param cullOperationID: The name of the output directory to create.
    @type cullOperationID: A Python string.
    @return: The location of the output directory.
    @rtype: A Python string.

    """

    # Reject bad arguments before touching the file system, so that an invalid call can not
    # delete the results of a previous run.
    if inputFile == '' or maxPercentage == 0:
        # If this occurs then the user has either not specified an input file or a percentage threshold.
        # Both of these arguments are mandatory.
        # NOTE(review): help is resolved from the module scope; if the module does not define its own
        # help function this is the interactive builtin.
        help()
        sys.exit()
    elif maxPercentage < 5 or maxPercentage > 100:
        print('The valid range for the maximum allowable percentage sequence')
        print('similarity is 5 - 100.')
        sys.exit()

    # The number of cores handed to performBLAST.main is fixed for this entry point.
    cores = 2

    # The output directory is placed alongside this source file.
    srcLocation = os.path.dirname(os.path.abspath(__file__))
    outputLocation = srcLocation + '/' + cullOperationID
    if os.path.isdir(outputLocation):
        shutil.rmtree(outputLocation)
    elif os.path.exists(outputLocation):
        os.remove(outputLocation)
    # Clear the umask so that the permission bits requested when creating the directory and the
    # output files are not masked out.
    os.umask(0)
    os.mkdir(outputLocation)

    # Ensure that the fasta file input is appropriately formatted.
    fileToBLAST = outputLocation + '/InputCopy.fasta'
    with open(inputFile, 'r') as inputFileToLoad:
        fastaText = inputFileToLoad.read()
    errorCode, message = checkfastaformat.main(fastaText)
    if errorCode != 0:
        print(message)
        sys.exit()
    # On success the returned message is the text that gets BLASTed.
    with open(fileToBLAST, 'w') as writeOut:
        writeOut.write(message)

    # Perform the BLASTing.
    # NOTE(review): the BLAST output location is given relative to cullOperationID rather than
    # outputLocation - confirm how performBLAST.main resolves it.
    similarities = performBLAST.main(fileToBLAST, fileToBLAST, cullOperationID + '/BLASTOutput', SEG, cores)

    # Create the sparsematrix of the protein similarity graph.
    adjList, protNames = adjlistcreation.user_seq_main(similarities, maxPercentage)

    # Choose which protein to remove from the similarity graph.
    if protNames == []:
        proteinsToCull = []
    else:
        proteinsToCull, _proteinsToKeep = Leafcull.main(adjList, protNames)

    # Write out the proteins that were removed.
    with open(outputLocation + '/Removed.txt', 'w') as writeOutRem:
        for protein in proteinsToCull:
            writeOutRem.write(protein + '\n')

    # Write out a fasta file of the proteins kept, along with a table of their IDs and lengths.
    culled = set(proteinsToCull)
    seenHeaders = set()  # Used to ensure no duplicates somehow get through.
    recording = False
    keptLength = 0
    with open(fileToBLAST, 'r') as readFasta:
        with open(outputLocation + '/KeptFasta.fasta', 'w') as writeOutKeepFasta:
            with open(outputLocation + '/KeptList.txt', 'w') as writeOutKeepList:
                writeOutKeepList.write('IDs\tlength\n')
                for line in readFasta:
                    # Strip the line ending explicitly so an unterminated last line is not truncated.
                    text = line.rstrip('\r\n')
                    if line.startswith('>'):
                        if recording:
                            # Finish the row of the protein that was being recorded.
                            writeOutKeepList.write('\t' + str(keptLength) + '\n')
                        header = text[1:]
                        recording = header not in culled and header not in seenHeaders
                        if recording:
                            # The line starts a new protein definition, and that protein is one to keep.
                            seenHeaders.add(header)
                            writeOutKeepFasta.write(line)
                            writeOutKeepList.write(header)
                            keptLength = 0
                    elif recording:
                        # A sequence line of a protein that is being kept. The length is accumulated
                        # so that a sequence spread over several lines still gives a single row.
                        writeOutKeepFasta.write(line)
                        keptLength += len(text)
                if recording:
                    writeOutKeepList.write('\t' + str(keptLength) + '\n')

    return outputLocation
def main():
    """Runs the protein culling.

    The command line arguments are read from sys.argv by argparse; the function takes no parameters
    and returns nothing.

    The output directory is created in the current working directory. Anything already present at
    that location is deleted first. The following files are written into it:
        InputCopy.fasta - the text returned by checkfastaformat.main.
        Removed.txt     - one culled protein ID per line.
        KeptFasta.fasta - the FASTA records of the proteins that were kept.
        KeptList.txt    - a tab separated table of the kept protein IDs and their sequence lengths.

    sys.exit() is called when the arguments are invalid, when the output directory can not be
    created or when checkfastaformat.main reports an error.

    """

    #===========================================================================
    # Parse the user's input.
    #===========================================================================
    parser = argparse.ArgumentParser(description=('Generate a non-redundant dataset of sequences from a FASTA file of input sequences. ' +
                                                  'Please see the README for more information on how to use this program.'),
                                     epilog=('This program is designed to cull a dataset of protein sequences so that no ' +
                                             'two sequences have a sequence identity greater than the specified threshold ' +
                                             'percentage. The method used is the Leaf heuristic, which is described in PAPER. ' +
                                             'A server to perform the culling can be found at http://www.bioinf.manchester.ac.uk/leaf/.')
                                     )
    parser.add_argument('inputFile', help='The location of the input FASTA file. (Required type: %(type)s).', type=str)
    parser.add_argument('-p', '--percent', help='The maximum percent sequence identity between sequences 5 <= maxPercent < 100 must be true. (Required type: %(type)s, default value: %(default)s).',
                        metavar="maxPercent", type=float, default=20, required=False)
    parser.add_argument('-m', '--minLen', help='The minimum sequence length permissible. A negative value means not to use a minimum sequence length. Must not be greater than the maximum sequence length. (Required type: %(type)s, default value: Not Used).',
                        metavar="minLength", type=int, required=False, default=-1)
    parser.add_argument('-a', '--maxLen', help='The maximum sequence length permissible. A negative value means not to use a maximum sequence length. Must not be less than the minimum sequence length. (Required type: %(type)s, default value: Not Used).',
                        metavar="maxLength", type=int, required=False, default=-1)
    parser.add_argument('-s', '--seg', help='Whether or not to run SEG prior to BLASTing. (Default value: Not used).',
                        action='store_true', default=False, required=False)
    parser.add_argument('-c', '--cores', help='The number of processor cores to use for BLASTing. (Required type: %(type)s, default value: %(default)s).',
                        metavar="cores", type=int, default=2, required=False)
    parser.add_argument('-o', '--output', help='The name of the output directory to create in the current working directory. (Required type: %(type)s, default value: %(default)s).',
                        metavar="outputFolder", type=str, default='CullResults', required=False)
    parser.add_argument('-v', '--verbose', help='Whether status updates should be displayed. (Default value: No status updates).',
                        action='store_true', default=False, required=False)
    args = parser.parse_args()

    inputFile = args.inputFile
    sequenceIdentity = args.percent
    minLength = args.minLen
    maxLength = args.maxLen
    SEG = args.seg
    cores = args.cores
    cullOperationID = args.output
    verboseOutput = args.verbose

    #===========================================================================
    # Validate the user's input.
    #===========================================================================
    # Every problem is reported before exiting, rather than stopping at the first one.
    toExit = False
    if not os.path.isfile(inputFile):
        print('The location supplied for the file of input sequences is not a valid file location.')
        toExit = True

    if sequenceIdentity < 5 or sequenceIdentity >= 100:
        print('The maximum allowable percentage sequence similarity must be no less than 5, and less than 100.')
        toExit = True

    # Any negative length means that the limit is not used; normalise it to -1.
    if minLength < 0:
        minLength = -1
    if maxLength < 0:
        maxLength = -1

    # The limits can only conflict when a maximum was actually supplied. Without this guard a
    # minimum given on its own would be rejected for being greater than the -1 placeholder.
    if maxLength != -1 and minLength > maxLength:
        print('The minimum sequence length must be less than the maximum sequence length.')
        toExit = True

    if toExit:
        sys.exit()

    #===========================================================================
    # Perform the culling.
    #===========================================================================

    # Create the directory to store the output in.
    if verboseOutput:
        print('Creating the output directory.')
    outputLocation = os.path.join(os.getcwd(), cullOperationID)
    try:
        if os.path.isdir(outputLocation):
            shutil.rmtree(outputLocation)
        elif os.path.exists(outputLocation):
            os.remove(outputLocation)
        os.mkdir(outputLocation)
    except EnvironmentError:
        # Only file system failures are handled here, so that interrupts are not swallowed.
        print('The output directory could not be created. Please check the location specified in  the input parameters.')
        print('If you did not specify a location then consider changing the default output location (the variable cullOperationID)')
        sys.exit()

    # Ensure that the FASTA file input is appropriately formatted.
    if verboseOutput:
        print('Validating the input file.')
    fileToBLAST = os.path.join(outputLocation, 'InputCopy.fasta')
    with open(inputFile, 'r') as inputFileToLoad:
        fastaText = inputFileToLoad.read()
    errorCode, message = checkfastaformat.main(fastaText, minLength, maxLength)
    if errorCode != 0:
        print(message)
        sys.exit()
    # On success the returned message is the text that gets BLASTed.
    with open(fileToBLAST, 'w') as writeOut:
        writeOut.write(message)

    # Perform the BLASTing.
    if verboseOutput:
        print('Starting the BLASTing.')
    # NOTE(review): the BLAST output location is given relative to cullOperationID rather than
    # outputLocation - confirm how performBLAST.main resolves it.
    similarities = performBLAST.main(fileToBLAST, fileToBLAST, os.path.join(cullOperationID, 'BLASTOutput'), SEG, cores, verboseOutput=verboseOutput)

    # Create the sparsematrix of the protein similarity graph.
    if verboseOutput:
        print('Creating the adjacency matrix')
    adjList, protNames = adjlistcreation.user_seq_main(similarities, sequenceIdentity)

    # Choose which proteins to remove from the similarity graph.
    if verboseOutput:
        print('Performing the culling.')
    if protNames == []:
        # This is True if there are no similarities greater than the given percentage sequence identity. If there are no
        # proteins that are too similar, then there is no need to cull any proteins from the network.
        proteinsToCull = []
    else:
        proteinsToCull, _proteinsToKeep = Leafcull.main(adjList, protNames)

    if verboseOutput:
        print('Writing out the results.')

    # Write out the proteins that were removed.
    with open(os.path.join(outputLocation, 'Removed.txt'), 'w') as writeOutRem:
        for protein in proteinsToCull:
            writeOutRem.write(protein + '\n')

    # Write out a FASTA file of the proteins kept, along with a table of their IDs and lengths.
    culled = set(proteinsToCull)
    seenHeaders = set()  # Used to ensure no duplicates get through.
    recording = False
    keptLength = 0
    with open(fileToBLAST, 'r') as readFasta:
        with open(os.path.join(outputLocation, 'KeptFasta.fasta'), 'w') as writeOutKeepFasta:
            with open(os.path.join(outputLocation, 'KeptList.txt'), 'w') as writeOutKeepList:
                writeOutKeepList.write('IDs\tLength\n')
                for line in readFasta:
                    # Strip the line ending explicitly so an unterminated last line is not truncated.
                    text = line.rstrip('\r\n')
                    if line.startswith('>'):
                        if recording:
                            # Finish the row of the protein that was being recorded.
                            writeOutKeepList.write('\t' + str(keptLength) + '\n')
                        header = text[1:]
                        recording = header not in culled and header not in seenHeaders
                        if recording:
                            # The line starts a new protein definition, and that protein is one to keep.
                            seenHeaders.add(header)
                            writeOutKeepFasta.write(line)
                            writeOutKeepList.write(header)
                            keptLength = 0
                    elif recording:
                        # A sequence line of a protein that is being kept. The length is accumulated
                        # so that a sequence spread over several lines still gives a single row.
                        writeOutKeepFasta.write(line)
                        keptLength += len(text)
                if recording:
                    writeOutKeepList.write('\t' + str(keptLength) + '\n')
# Example #3
# 0
def main():
    """Runs the protein culling.

    The command line arguments are read from ``sys.argv`` by argparse; the function takes no
    parameters and returns nothing.

    The default output directory is created in the current working directory; a user supplied
    output location is used as given. Anything already present at the output location is deleted
    first. The following files are written into it:

    * ``InputCopy.fasta`` - the text returned by ``checkfastaformat.main``.
    * ``Removed.txt``     - one culled protein ID per line.
    * ``KeptFasta.fasta`` - the FASTA records of the proteins that were kept.
    * ``KeptList.txt``    - a tab separated table of the kept protein IDs and their sequence lengths.

    ``sys.exit()`` is called when the arguments are invalid, when the output directory can not be
    created or when ``checkfastaformat.main`` reports an error.

    """

    #===========================================================================
    # Parse the user's input.
    #===========================================================================
    parser = argparse.ArgumentParser(description=('Generate a non-redundant dataset of sequences from a FASTA file of input sequences. ' +
                                                  'Please see the README for more information on how to use this program.'),
                                     epilog=('This program is designed to cull a dataset of protein sequences so that no ' +
                                             'two sequences have a sequence identity greater than the specified threshold ' +
                                             'percentage. The method used is the Leaf heuristic, which is described in a paper located at ' +
                                             'http://www.plosone.org/article/info%3Adoi%2F10.1371%2Fjournal.pone.0055484. ' +
                                             'A server to perform the culling can be found at http://leaf-protein-culling.appspot.com/.')
                                     )
    parser.add_argument('inputFile', help='The location of the input FASTA file.')
    parser.add_argument('-p', '--percent', help='The maximum percent sequence identity between sequences 5 <= maxPercent < 100 must be true. (Required type: %(type)s, default value: %(default)s).',
                        metavar="maxPercent", type=float, default=20, required=False)
    parser.add_argument('-m', '--minLen', help='The minimum sequence length permissible. A negative value means not to use a minimum sequence length. Must not be greater than the maximum sequence length. (Required type: %(type)s, default value: Not Used).',
                        metavar="minLength", type=int, required=False, default=-1)
    parser.add_argument('-a', '--maxLen', help='The maximum sequence length permissible. A negative value means not to use a maximum sequence length. Must not be less than the minimum sequence length. (Required type: %(type)s, default value: Not Used).',
                        metavar="maxLength", type=int, required=False, default=-1)
    parser.add_argument('-c', '--cores', help='The number of processor cores to use for BLASTing. (Required type: %(type)s, default value: %(default)s).',
                        metavar="cores", type=int, default=2, required=False)
    parser.add_argument('-o', '--output', help='The name of the output directory to create in the current working directory. (Required type: %(type)s, default value: a directory called %(default)s in the current working directory).',
                        metavar="outputFolder", type=str, default='CullResults', required=False)
    parser.add_argument('-v', '--verbose', help='Whether status updates should be displayed. (Default value: No status updates).',
                        action='store_true', default=False, required=False)
    args = parser.parse_args()

    inputFile = args.inputFile
    sequenceIdentity = args.percent
    minLength = args.minLen
    maxLength = args.maxLen
    cores = args.cores
    cullOperationID = args.output
    verboseOutput = args.verbose

    #===========================================================================
    # Validate the user's input.
    #===========================================================================
    # Every problem is reported before exiting, rather than stopping at the first one.
    toExit = False
    if not os.path.isfile(inputFile):
        print('The location supplied for the file of input sequences is not a valid file location.')
        toExit = True

    if sequenceIdentity < 5 or sequenceIdentity >= 100:
        print('The maximum allowable percentage sequence similarity must be no less than 5, and less than 100.')
        toExit = True

    # Any negative length means that the limit is not used; normalise it to -1.
    if minLength < 0:
        minLength = -1
    if maxLength < 0:
        maxLength = -1

    # The limits can only conflict when a maximum was actually supplied. Without this guard a
    # minimum given on its own would be rejected for being greater than the -1 placeholder.
    if maxLength != -1 and minLength > maxLength:
        print('The minimum sequence length must be less than the maximum sequence length.')
        toExit = True

    if toExit:
        sys.exit()

    #===========================================================================
    # Perform the culling.
    #===========================================================================

    # Create the directory to store the output in.
    if verboseOutput:
        print('Creating the output directory.')
    if cullOperationID == 'CullResults':
        # Only the default name is anchored to the current working directory.
        outputLocation = os.getcwd() + '/' + cullOperationID
    else:
        outputLocation = cullOperationID
    try:
        if os.path.isdir(outputLocation):
            shutil.rmtree(outputLocation)
        elif os.path.exists(outputLocation):
            os.remove(outputLocation)
        os.mkdir(outputLocation)
    except EnvironmentError:
        # Only file system failures are handled here, so that interrupts are not swallowed.
        print('The output directory could not be created. Please check the location specified in  the input parameters.')
        print('If you did not specify a location then consider changing the default output location (the variable cullOperationID)')
        sys.exit()

    # Ensure that the FASTA file input is appropriately formatted.
    if verboseOutput:
        print('Validating the input file.')
    fileToBLAST = outputLocation + '/InputCopy.fasta'
    with open(inputFile, 'r') as inputFileToLoad:
        fastaText = inputFileToLoad.read()
    errorCode, message = checkfastaformat.main(fastaText, minLength, maxLength)
    if errorCode != 0:
        print(message)
        sys.exit()
    # On success the returned message is the text that gets BLASTed.
    with open(fileToBLAST, 'w') as writeOut:
        writeOut.write(message)

    # Perform the BLASTing.
    similarities = performBLAST.main(fileToBLAST, outputLocation + '/BLASTOutput', cores, verboseOutput=verboseOutput)

    # Create the adjacency matrix of the protein similarity graph.
    if verboseOutput:
        print('Creating the adjacency matrix')
    adjList = {}
    for pair in similarities:
        # Each key is indexed as a pair of chain IDs; the value is their sequence identity.
        chainA = pair[0]
        chainB = pair[1]
        if similarities[pair] >= sequenceIdentity:
            # The sequences are too similar, so join them in both directions.
            adjList.setdefault(chainA, set()).add(chainB)
            adjList.setdefault(chainB, set()).add(chainA)

    # Choose which proteins to remove from the similarity graph.
    if verboseOutput:
        print('Performing the culling.')
    proteinsToCull = Leafcull.main(adjList)

    if verboseOutput:
        print('Writing out the results.')

    # Write out the proteins that were removed.
    with open(outputLocation + '/Removed.txt', 'w') as writeOutRem:
        for protein in proteinsToCull:
            writeOutRem.write(protein + '\n')

    # Write out a FASTA file of the proteins kept, along with a table of their IDs and lengths.
    # A header belongs to a culled protein when the text after the '>' starts with a culled ID.
    # NOTE(review): this is a plain prefix match, so a culled ID 'P1' also matches a header 'P10' -
    # confirm whether the IDs can be prefixes of one another.
    culledIDs = tuple(proteinsToCull)
    seenHeaders = set()  # Used to ensure no duplicates get through.
    recording = False
    keptLength = 0
    with open(fileToBLAST, 'r') as readFasta:
        with open(outputLocation + '/KeptFasta.fasta', 'w') as writeOutKeepFasta:
            with open(outputLocation + '/KeptList.txt', 'w') as writeOutKeepList:
                writeOutKeepList.write('IDs\tLength\n')
                for line in readFasta:
                    # Strip the line ending explicitly so an unterminated last line is not truncated.
                    text = line.rstrip('\r\n')
                    if line.startswith('>'):
                        if recording:
                            # Finish the row of the protein that was being recorded.
                            writeOutKeepList.write('\t' + str(keptLength) + '\n')
                        header = text[1:]
                        recording = not line.startswith(culledIDs, 1) and header not in seenHeaders
                        if recording:
                            # The line starts a new protein definition, and that protein is one to keep.
                            seenHeaders.add(header)
                            writeOutKeepFasta.write(line)
                            writeOutKeepList.write(header)
                            keptLength = 0
                    elif recording:
                        # A sequence line of a protein that is being kept. The length is accumulated
                        # so that a sequence spread over several lines still gives a single row.
                        writeOutKeepFasta.write(line)
                        keptLength += len(text)
                if recording:
                    writeOutKeepList.write('\t' + str(keptLength) + '\n')