Esempio n. 1
0
    def annotateInputSequence(self):
        """Annotate the sequence in the input text area after user confirmation.

        The GUI is disabled while the work runs and re-enabled afterwards,
        on both the success and the failure path. When the user confirms the
        popup, the text of the input widget is cleaned/validated, stored as
        the raw sequence of the current submission's allele, annotated through
        the service configured under 'nmdp_act_rest_address', and the
        annotated result is written back to the sequence text.

        Any exception is shown in an info box and then re-raised.
        """
        logging.debug('Annotating Input Sequence')
        try:
            self.disableGUI()
            self.update()

            # Popup.  The annotation itself is done by the NMDP BTM ACT tool.
            confirmationText = ''.join((
                'This will annotate your sequence using the\n',
                'NMDP: BeTheMatch Gene Feature\n',
                'Enumeration / Allele Calling Tool.\n\n',
                'Do you want to continue?'))
            userConfirmed = messagebox.askyesno('Annotate Sequence?',
                                                confirmationText)

            if userConfirmed:
                inputText = self.featureInputGuiObject.get('1.0', 'end')
                roughNucleotideSequence = collectAndValidateRoughSequence(
                    inputText)

                batchEntries = self.submissionBatch.submissionBatch
                submittedAllele = batchEntries[
                    self.submissionIndex].submittedAllele
                submittedAllele.rawSequence = roughNucleotideSequence

                serviceAddress = getConfigurationValue('nmdp_act_rest_address')
                submittedAllele.annotateSequenceUsingService(
                    rawRequestURL=serviceAddress)

                annotatedText = submittedAllele.getAnnotatedSequence(
                    includeLineBreaks=True)
                self.overwriteSequenceText(annotatedText)

            self.update()
            self.enableGUI()

        except Exception:
            # Tell the user, restore the GUI, then let the caller see the error.
            showInfoBox('Error Annotating Input Sequence.', str(exc_info()))
            self.update()
            self.enableGUI()
            raise
Esempio n. 2
0
    def howToUse(self):
        """Show an info box with step-by-step instructions for using the tool.

        An error is logged on every call as a reminder that the instruction
        text may be out of date.
        """
        logging.error(
            'howToUse() is probably outdated. Check if it needs updating.')

        # The pieces are joined without a separator; each piece carries its
        # own line breaks.
        instructionPieces = (
            'This software is to be used to create an\n',
            'ENA-formatted submission document,\n',
            'which specifies a (novel) HLA allele.\n\n',
            'This tool requires you to submit a\n',
            'full length HLA allele, including\n',
            '5\' and 3\' UTRs.\n\n',
            'To create & submit an EMBL-ENA submission:\n\n',
            '1.) Paste a full-length HLA sequence in\n',
            'the Annotated Sequence text area.\n',
            '2.) Push [Submission Options] and provide\n',
            'the necessary sequence metadata.\n',
            '3.) Push [Annotate Exons & Introns] to\n',
            'annotate your exons automatically.\n',
            '4.) Push [Generate an EMBL-ENA submission]\n',
            'button to generate a submission.\n',
            '5.) Push [Upload Submission to EMBL-ENA]\n',
            'to submit the sequence\n',
            'using ENA Webin REST interface\n\n',
            'If exon annotation is not available,\n',
            'it may be necessary to annotate manually.\n\n',
            'Sequences should follow this pattern:\n',
            '5\'utr EX1 int1 EX2 ... EX{X} 3\'utr\n\n',
            'Use capital letters for exons,\n',
            'lowercase for introns & UTRs.\n\n',
            'Push the "Example Sequence" button to see\n',
            'an example of a formatted sequence.\n\n',
            'More information available\n',
            'on the MUMC Github Page:\n',
            'https://github.com/transplantation-\n',
            'immunology-maastricht/saddle-bags',
        )
        showInfoBox('How to use this tool', ''.join(instructionPieces))
Esempio n. 3
0
    def fetchAnnotationJson(self, rawRequestURL=None):
        """Request an annotation for self.rawSequence and return the response text.

        The sequence is url-encoded as a ``sequence`` parameter and appended
        to rawRequestURL as a query string.

        :param rawRequestURL: base URL of the annotation service. It is passed
            in because reading the configuration here causes a circular
            dependency.
        :return: the response body decoded as UTF-8, or None when no URL was
            given, the response was empty, or the response is an HTML page
            (which indicates a web server error rather than JSON results).
        :raises Exception: anything raised while building or performing the
            request is logged and re-raised.
        """
        # Bound before the try block so the except handler can always log it,
        # even when the failure happens before the URL has been built.
        requestURL = None
        try:
            postData = {'sequence': self.rawSequence}

            # Using configuration here causes circular dependency. So it is passed in.
            if rawRequestURL is None:
                logging.error('You must pass a rawRequestURL to fetchAnnotationJson.')
                return None

            requestURL = rawRequestURL + '?' + urlencode(postData)

            resultsIoObject = BytesIO()
            curlObject = Curl()
            try:
                curlObject.setopt(curlObject.URL, requestURL)
                curlObject.setopt(curlObject.WRITEDATA, resultsIoObject)
                curlObject.perform()
            finally:
                # Release the handle even when perform() raises.
                curlObject.close()

            getBody = resultsIoObject.getvalue().decode('utf8')

            logging.debug('JSON Request Body:\n' + getBody)

            # TODO:
            # Detect error <head><title>414 Request-URI Too Large</title></head>
            # For larger DRB alleles the webserver fails.
            # Detect error if the result is not json.
            # Simple case is an empty string.
            if not getBody:
                logging.error('The JSON results were an empty string. Is there a problem with the ACT server?:' + str(requestURL))
                showInfoBox('Problem Accessing Annotation Service','The JSON results were an empty string. Is there a problem with the ACT server?')
                return None

            # If it's an html error we can respond nicely.
            # (A 5-character slice can never equal '<html>', so test the prefix.)
            if getBody.lstrip().lower().startswith('<html'):
                openingTag = '<title>'
                titleStart = getBody.find(openingTag)
                titleEnd = getBody.find('</title>')
                if 0 <= titleStart < titleEnd:
                    errorCode = getBody[titleStart + len(openingTag):titleEnd]
                else:
                    # No usable title; show the start of the page instead.
                    errorCode = getBody.strip()[0:100]
                logging.error('The annotation JSON results are html, this probably indicates an issue with the annotation webserver:\n' + str(requestURL))
                showInfoBox('Problem Accessing Annotation Service', 'The annotation results are HTML, not JSON, probably an issue with the ACT webserver:\n' + str(errorCode))
                return None

            return getBody

        except Exception:
            logging.error('Exception when performing CURL:\n')
            logging.error(str(exc_info()))
            logging.error('URL:' + str(requestURL))

            raise
Esempio n. 4
0
    def deleteCurrentSubmission(self):
        """Delete the currently selected submission from the batch.

        The current submission is saved first. When exactly one submission
        is in the batch it is kept and an info box explains why. Otherwise
        the entry at submissionIndex is removed, the index is moved one step
        left if it now points past the end, and the current submission is
        loaded again.
        """
        logging.debug('deleteCurrentSubmission pressed')
        self.saveCurrentSubmission()

        submissions = self.submissionBatch.submissionBatch
        if len(submissions) == 1:
            showInfoBox(
                'Cannot delete last submission.',
                'You cannot delete the last remaining submission in the batch.'
            )
            return

        del submissions[self.submissionIndex]

        # The rightmost entry was removed: step back onto the new last entry.
        if self.submissionIndex >= len(submissions):
            self.submissionIndex -= 1

        self.loadCurrentSubmission()
Esempio n. 5
0
    def constructSubmission(self):
        """Generate the ENA submission text for the current submission.

        An EnaSubGenerator is handed the current submission and the whole
        batch; the text it builds is written to the submission text area.
        When nothing was generated, a warning is logged and the text area is
        cleared instead.

        KeyError and HlaSequenceException are reported in an info box and not
        propagated. Any other exception is reported and re-raised.
        """
        logging.debug('Constructing Submission')
        try:
            selectedSubmission = self.submissionBatch.submissionBatch[
                self.submissionIndex]

            # TODO: What happens when the sequence is not annotated yet?
            # An earlier version offered to auto-annotate at this point.

            submissionGenerator = EnaSubGenerator()
            submissionGenerator.submission = selectedSubmission
            submissionGenerator.submissionBatch = self.submissionBatch
            enaSubmissionText = submissionGenerator.buildENASubmission()

            if enaSubmissionText is not None and len(enaSubmissionText) >= 1:
                self.overwriteSubmissionText(enaSubmissionText)
            else:
                logging.warning('Submission text is empty.')
                self.overwriteSubmissionText('')

        except KeyError:
            missingOptionsText = ''.join((
                'You are missing some required information.\n',
                'Use the \'Submission Options\' button.\n',
                'Missing Data: ',
                str(exc_info())))
            showInfoBox('Missing Submission Options', missingOptionsText)

        except HlaSequenceException:
            formatProblemText = str(exc_info())
            showInfoBox('I see a problem with Sequence Format.',
                        formatProblemText)

        except Exception:
            showInfoBox('Error Constructing Submission.', str(exc_info()))
            raise
Esempio n. 6
0
def checkPrerequisites():
    """Warn the user when Java or the EMBL command line jar file is missing.

    Runs ``java -version`` and looks for a quoted version number in its
    output, then checks that the path returned by findJarFile() is an
    existing file. Problems are reported with showInfoBox and in the log.
    A failing Java check is never raised to the caller; nothing is returned.
    """
    logging.debug('Checking for prerequisites')

    missingJavaTitle = 'Missing Java'
    missingJavaText = 'Warning.\nJava version\nwas not found.\nPerhaps java is missing?'

    # Do we have Java?
    # That's a complicated question. Gotta deal with lots of stuff to check that in windows, inside pyinstaller.
    try:
        # subprocess.STARTUPINFO only exists on Windows.
        if hasattr(subprocess, 'STARTUPINFO'):
            logging.debug('This is Windows.')
            # On Windows, this should avoid popping up a console window, when run in --noconsole mode.
            startupInfo = subprocess.STARTUPINFO()
            startupInfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
            # Pass cmd an environment so Windows will search the path variables.
            environVars = environ
            # Use an intermediate shell to launch the process? Yes, in Windows.
            useShell = True
        else:
            logging.debug('This is not Windows.')
            # These are not needed outside Windows.
            startupInfo = None
            environVars = None
            useShell = False

        # "java -version" reports on stderr, so stderr is merged into the captured output.
        processArgs = {
            'stdin': PIPE,
            'stderr': STDOUT,
            'startupinfo': startupInfo,
            'env': environVars,
            'universal_newlines': True,
            'shell': useShell
        }

        javaVersionOutput = str(check_output(['java', '-version'], **processArgs))
        logging.debug('Java Version Output: ' + javaVersionOutput)

        # The version is the quoted token. The minor part is optional because
        # newer Java releases may print only a major number, e.g. "17".
        versionMatch = search(r'"(\d+(?:\.\d+)?)[^"]*"', javaVersionOutput)
        if versionMatch is None:
            logging.error('Java version was not found. Perhaps java is missing?')
            showInfoBox(missingJavaTitle, missingJavaText)
        else:
            logging.debug('Java Version: ' + str(versionMatch.group(1)))

    except Exception as e:
        # Best effort: any failure to run java is reported as "missing".
        showInfoBox(missingJavaTitle, missingJavaText)
        logging.error('Java version was not found. Perhaps java is missing?')
        logging.debug(exc_info()[1])
        logging.debug(str(e))

    # Do i have the EMBL Commandline Jar file?
    jarFileLocation = findJarFile()
    # Guard against None: isfile(None) raises TypeError instead of returning False.
    if jarFileLocation is not None and isfile(jarFileLocation):
        logging.debug('Using this EMBL Jar file:' + str(jarFileLocation))
    else:
        logging.error('This does not appear to be a valid jar file:' +
                      str(jarFileLocation))
        showInfoBox(
            'Missing Jar File',
            'Warning.\nEMBL Commandline Jar File\nwas not found:\n' +
            str(jarFileLocation))
Esempio n. 7
0
    def identifyFeaturesFromJson(self, sequenceAnnotationJson):
        """Parse the annotation JSON and store the genomic features on this object.

        self.features is rebuilt from the ``features`` list of the JSON.
        UTRs and introns are stored lowercase with exon=False, exons are
        stored uppercase with exon=True. After naming the features, the
        annotated sequence is compared with self.rawSequence as a sanity check.

        :param sequenceAnnotationJson: the JSON text, or an object whose str()
            is the JSON text (the source comments say the ACT API returns a
            "Typing" object). None or a string of at most one character is
            logged as empty and ignored.
        :return: None. Parsing problems are logged and shown in an info box;
            they are not raised to the caller.
        """
        # Test for None before str(): str(None) is 'None', which would slip
        # past the length check and fail later as a confusing parse error.
        if sequenceAnnotationJson is None:
            logging.error('JSON Parse is empty.')
            return

        jsonString = str(sequenceAnnotationJson)
        if len(jsonString) <= 1:
            logging.error('JSON Parse is empty.')
            return

        try:
            self.features = []
            fivePrimeSequence = ''

            parsedJson = loads(jsonString)

            if len(parsedJson.keys()) <= 1:
                raise Exception('No keys found in the JSON Dictionary, unable to annotate sequence.')

            if 'features' not in parsedJson:
                raise Exception('Unable to identify any HLA exon features, unable to annotate sequence.')

            # Loop through the recognized Features
            featureList = parsedJson['features']
            logging.info('I found this many Known Features:' + str(len(featureList)))

            for featureDictionary in featureList:
                term = str(featureDictionary['term'])
                rank = str(featureDictionary['rank'])
                sequence = str(featureDictionary['sequence'])

                logging.debug('Known Feature'
                              + ':' + term
                              + ':' + rank
                              + ':' + sequence)

                currentFeature = GeneFeature()

                if term == 'exon':
                    currentFeature.sequence = sequence.upper()
                    currentFeature.exon = True
                elif term in ('five_prime_UTR', 'three_prime_UTR', 'intron'):
                    currentFeature.sequence = sequence.lower()
                    currentFeature.exon = False
                    if term == 'five_prime_UTR':
                        fivePrimeSequence = currentFeature.sequence
                else:
                    raise Exception('Unknown Feature Term, expected exon or intron:' + term)

                self.features.append(currentFeature)

            cleanedRawSequence = cleanSequence(self.rawSequence)

            if len(fivePrimeSequence) < 1:
                logging.warning('I cannot find a five prime UTR.')
                logging.info('Rough Sequence:\n' + cleanedRawSequence.upper())
                # Log what was collected so far. (This used to reference an
                # undefined name, which hid the real message behind a NameError.)
                collectedSequence = ''.join(
                    str(feature.sequence) for feature in self.features)
                logging.info('Annotated Sequence:\n' + cleanSequence(collectedSequence).upper())
                raise Exception('GFE service did not find a 5\' UTR sequence. You will need to annotate the genomic features manually.')

            cleanedFivePrime = cleanSequence(fivePrimeSequence).upper()
            if cleanedFivePrime in cleanedRawSequence.upper():
                # What if the reported 5' UTR is less than what is returned by GFE?
                # NOTE(review): the extended 5' UTR below is only logged; it is
                # never written back to self.features. Confirm whether the
                # first feature was meant to be replaced with it.
                beginIndex = cleanedRawSequence.upper().find(cleanedFivePrime)
                endIndex = beginIndex + len(fivePrimeSequence)
                logging.info('GFE sequence exists in rough sequence, at index: (' + str(beginIndex) + ':' + str(endIndex) + ')')
                logging.info('previous fivePrime Sequence=\n' + fivePrimeSequence)
                extendedFivePrime = cleanedRawSequence[0:endIndex].lower()
                logging.info('new fivePrime Sequence=\n' + extendedFivePrime)

            self.nameAnnotatedFeatures()

            # Final check: Do the annotated sequence and rough sequence match?
            annotatedForComparison = cleanSequence(self.getAnnotatedSequence(includeLineBreaks=False)).upper()
            if annotatedForComparison == cleanSequence(self.rawSequence).upper():
                logging.info('Successful annotation.')
            else:
                logging.error('Rough Sequence:\n' + cleanSequence(self.rawSequence).upper())
                logging.error('Annotated Sequence:\n' + cleanSequence(self.getAnnotatedSequence(includeLineBreaks=True)).upper())
                raise Exception('Annotated sequence and rough sequence do not match. Something went wrong in Annotation.')

        except Exception:
            # Deliberately not re-raised: the user is told to annotate manually.
            logging.error(str((exc_info())))
            showInfoBox('Exon Parsing Error', 'I had trouble annotating your sequence:\n' + str(exc_info()) + '. You will have to annotate manually.')
Esempio n. 8
0
def translateSequence(submission):
    """Translate the exon sequence of a submission into a protein sequence.

    The exon sequence is read from submission.submittedAllele.getExonSequence().
    The protein is cut at the first stop codon ('*'). submission.isPseudoGene
    is set to True when no stop codon is found or when the stop codon is
    premature, and to False when the stop codon is the last residue. Unusual
    situations are reported to the user with showInfoBox.

    :param submission: object providing submittedAllele.getExonSequence() and
        localAlleleName; its isPseudoGene attribute is assigned here.
    :return: the translated protein without the stop codon and anything
        after it, or '' when the exon sequence is empty.
    :raises Exception: any error during translation is logged, shown to the
        user, and re-raised.
    """
    # TODO: This method should be a class method of AlleleSubmission. Move it there.
    inputSequence = submission.submittedAllele.getExonSequence()
    proteinSequence = ''
    alleleLocalName = submission.localAlleleName

    try:
        # Do nothing if the input sequence is blank.
        if len(inputSequence) < 1:
            logging.warning('Translating a nucleotide sequence of length 0. Done. That was easy.')
            return proteinSequence

        coding_dna = Seq(inputSequence, generic_dna)
        proteinSequence = str(coding_dna.translate())
        # str() keeps these log lines working when the name is None and
        # avoids relying on str + Seq concatenation.
        logging.debug('Translating allele:' + str(alleleLocalName))
        logging.debug('Exon Sequence before translation:' + str(coding_dna))
        logging.debug('Translated Protein:' + proteinSequence)

        # Stop codon *should* be at the end of the protein.
        # Seek out the first stop codon and drop the peptides after it.
        stopCodonLocation = proteinSequence.find('*')
        nucleotideCount = len(coding_dna)
        isWholeCodons = (nucleotideCount % 3 == 0)

        if stopCodonLocation == -1:
            # No stop codon at all.
            submission.isPseudoGene = True
            logging.info('No Stop Codon found. This is a "pseudo-gene".')
            if isWholeCodons:
                showInfoBox('No Stop Codon Found',
                            'The translated protein does not contain a stop codon.\n' +
                            'This is indicated by a /pseudo flag in the sequence submission.')
            else:
                showInfoBox('No Stop Codon Found',
                            'The translated protein does not contain a stop codon.\n' +
                            'The coding nucleotide sequence length (' + str(nucleotideCount) +
                            ') is not a multiple of 3.\n' +
                            'This is indicated by a /pseudo flag in the sequence submission.')

        elif stopCodonLocation == len(proteinSequence) - 1:
            # Stop codon is the last residue (expected and correct).
            submission.isPseudoGene = False
            if isWholeCodons:
                logging.info('The stop codon is in the correct position. This is not a "pseudo-gene".')
            else:
                logging.info(
                    'The stop codon is in the correct position, but there are extra nucleotides. This is not a "pseudo-gene".')
                showInfoBox('Extra Nucleotides After the Stop Codon',
                            'The stop codon is at the correct position in the protein, but ' +
                            'The coding nucleotide sequence length (' + str(nucleotideCount) +
                            ') is not a multiple of 3.\n\n' +
                            'Please double check your sequence.')
            # Trim off the stop codon.
            proteinSequence = proteinSequence[0:stopCodonLocation]

        else:
            # Stop codon is premature (before the end of the protein).
            logging.info('A premature stop codon was found. This is a "pseudo-gene".')
            submission.isPseudoGene = True

            truncatedProtein = proteinSequence[0:stopCodonLocation]
            # Only mentioned when the length is not a whole number of codons.
            codonLengthNote = '' if isWholeCodons else 'Nucleotide count is not a multiple of 3,\n'
            showInfoBox('Premature Stop Codon Detected',
                        'Premature stop codon found:\nProtein Position (' +
                        str(stopCodonLocation + 1) + '/' +
                        str(len(proteinSequence)) + ')\n\n' +
                        'This is indicated by a /pseudo flag in the sequence submission.\n' +
                        codonLengthNote +
                        'Double check your protein sequence,\n' +
                        'this might indicate a missense mutation.\n\n' +
                        'Translated Protein:\n' + proteinSequence +
                        '\n\nProtein in ENA Submission:\n' + truncatedProtein +
                        '\n')
            proteinSequence = truncatedProtein

        return proteinSequence

    except Exception:
        logging.error('Problem when translating protein:')
        logging.error(str(exc_info()))
        showInfoBox('Protein Translation Error','I could not translate your protein:\n' + str(exc_info()))

        raise
Esempio n. 9
0
def collectAndValidateRoughSequence(roughNucleotideSequence):
    """Extract a cleaned nucleotide sequence from raw, fasta, or fastq text.

    The text is parsed as fasta and then as fastq. Whenever a parse yields
    exactly one record, that record's sequence is cleaned and used (a fastq
    result replaces a fasta result). If neither format matches, the whole
    input is cleaned with cleanSequence. The user is warned once if the
    result contains a character other than A, C, G, T (either case).

    :param roughNucleotideSequence: the text entered by the user.
    :return: the cleaned sequence.
    :raises Exception: unexpected errors are shown in an info box and
        re-raised. Errors while probing a single format are only logged.
    """
    try:
        cleanedSequence = None

        # Probe each supported format independently, so a failure in one
        # cannot prevent the other from being tried.
        for formatName in ('fasta', 'fastq'):
            try:
                logging.debug('Checking if sequence is ' + formatName + ' format.')
                parsedRecords = list(
                    SeqIO.parse(StringIO(roughNucleotideSequence), formatName))
                logging.debug('The length of the ' + formatName + ' seq list is:' + str(len(parsedRecords)))
                if len(parsedRecords) == 1:
                    cleanedSequence = cleanSequence(str(parsedRecords[0].seq))
                    logging.debug('The input sequence is in .' + formatName + ' format.')
                else:
                    logging.debug('This sequence is not in .' + formatName + ' format.')
            except Exception:
                logging.error('Exception when determining file type: ' + str(exc_info()))

        # TODO: If this file is xml what should we do?  Just give up i suppose.
        # TODO: I could warn the user that XML isn't supported yet...
        # We want to accept HML.  But there are too many xml formats.

        # Neither fasta nor fastq: treat the input as a bare sequence.
        if cleanedSequence is None:
            cleanedSequence = cleanSequence(roughNucleotideSequence)

        # Are we using any nonstandard / ambiguous nucleotides?
        standardNucleotides = frozenset('AGCTagct')
        firstNonstandard = next(
            (character for character in cleanedSequence
             if character not in standardNucleotides),
            None)
        if firstNonstandard is not None:
            showInfoBox('Nonstandard Nucleotide'
                        , 'I found a non-standard\n'
                        + 'character in your nucleotide\n'
                        + 'sequence: '
                        + str(firstNonstandard) + '\n'
                        + 'You should use standard nucleotide\n'
                        + 'characters in your submission.\n'
                        + 'I will attempt to continue.')

        return cleanedSequence

    except Exception:
        showInfoBox('Error Reading Input Sequence.'
                    , str(exc_info()))
        raise
Esempio n. 10
0
def validateAndSubmit(submission, submissionBatch, workingDirectory,
                      dateTimeNow):
    """Run the webin command line jar to submit a sequence and report the result.

    The jar located by findJarFile() is called with the manifest file
    ``manifest_<dateTimeNow>.txt`` from workingDirectory, writing to the
    ``SubmissionOutput`` sub directory (created if missing). '-test' is added
    when the 'test_submission' configuration value is 1. Afterwards the
    receipt.xml written by the tool is read and interpreted, and the outcome
    is shown to the user in an info box.

    :param submission: provides localAlleleName, which is used to locate the
        receipt file.
    :param submissionBatch: provides enaUserName and enaPassword.
    :param workingDirectory: directory that contains the manifest file.
    :param dateTimeNow: timestamp text that is part of the manifest file name.
    :return: None.
    """
    logging.info('Validating and Submitting Files.')

    # Find the webin cli application
    webinJarLocation = findJarFile()
    logging.info('Webin Jar file found here:' + str(webinJarLocation))

    # Construct the cli Parameters (for both validation and submission)
    # <outputDir> can be specified using the -outputDir option, the
    # <context> is specified using the -context option, and the
    # <name> is a submitter provided unique name specified in the manifest file.
    # manifest file is specified using the -manifest <filename>
    outputDir = join(workingDirectory, 'SubmissionOutput')
    if not isdir(outputDir):
        makedirs(outputDir)
    manifestFileName = join(workingDirectory, 'manifest_' + dateTimeNow + '.txt')

    # TODO: they list an option to use a proxy. https://ena-docs.readthedocs.io/en/latest/general-guide/webin-cli.html
    def buildCommand(modeFlag):
        # The two commands differ only in the -validate / -submit flag.
        return [
            'java', '-jar',
            str(webinJarLocation), modeFlag, '-outputDir', outputDir,
            '-context', 'sequence', '-manifest', manifestFileName,
            '-userName', submissionBatch.enaUserName,
            '-password', submissionBatch.enaPassword
        ]

    def maskPassword(command):
        # Copy of the command with the value after '-password' hidden, for logging.
        masked = list(command)
        for position, argument in enumerate(masked[:-1]):
            if argument == '-password':
                masked[position + 1] = '********'
        return masked

    validateCommand = buildCommand('-validate')
    submitCommand = buildCommand('-submit')

    if (int(getConfigurationValue('test_submission')) == 1):
        validateCommand.append('-test')
        submitCommand.append('-test')

    # The password must not end up in the log file, so log masked copies.
    logging.debug('validate webin command:' + str(maskPassword(validateCommand)))
    logging.debug('submit webin command:' + str(maskPassword(submitCommand)))

    # Only the submit command is executed; call() returns the exit code.
    jarSubmissionResults = call(submitCommand)
    logging.debug('webin submit command exit code:' + str(jarSubmissionResults))

    # The receipt file has the analysis accession, submission accession, and messages.
    # The webin commandline tool places things in a folder named after the
    # allele, but gets rid of special characters.
    # TODO: For HLA, that means * and : characters.
    receiptFolderName = submission.localAlleleName.replace('*', '_').replace(':', '_')
    analysisResultFileLocation = join(
        outputDir, 'sequence', receiptFolderName, 'submit', 'receipt.xml')

    # 'with' closes the receipt file again after reading.
    with open(analysisResultFileLocation, 'r') as analysisResultFile:
        analysisResultText = analysisResultFile.read()

    (analysisSubmissionSuccess, analysisAccessionNumber, analysisErrorMessages
     ) = interpretAnalysisSubmissionResults(analysisResultText)

    if analysisSubmissionSuccess:
        # Great. The analysis was created successfully.
        showInfoBox(
            'Successful Submission.',
            'Successful submission. ' + str(submission.localAlleleName) +
            ' has analysis Accession number is:' +
            str(analysisAccessionNumber))
    else:
        messageText = ('There was a problem in the Analysis Submission.\n' +
                       'I cannot continue.\n' +
                       'These messages were reported by ENA:\n')
        for errorMessage in analysisErrorMessages:
            messageText += ('\n' + str(errorMessage) + '\n')
        showInfoBox('Cannot Submit Analysis XML via REST', messageText)
        # No exception is active here, so log the messages ENA reported.
        logging.error('Failure to submit analysis submission file:' +
                      '; '.join(str(errorMessage)
                                for errorMessage in analysisErrorMessages) +
                      '\n')
        return
Esempio n. 11
0
def prepareSubmissionFiles(submission, submissionBatch, workingDirectory,
                           dateTimeNow):
    """Write the ENA flatfile, its gzipped copy and the manifest file.

    All three files are placed in ``workingDirectory`` (created when it does
    not exist yet) and carry ``dateTimeNow`` in their names:

    * ``HLA_Submission_<dateTimeNow>.txt``    - submission.enaSubmissionText
    * ``HLA_Submission_<dateTimeNow>.txt.gz`` - gzip copy of the flatfile
    * ``manifest_<dateTimeNow>.txt``          - NAME / STUDY / FLATFILE lines

    Args:
        submission: provides ``enaSubmissionText`` (flatfile contents) and
            ``localAlleleName`` (the manifest NAME entry).
        submissionBatch: provides ``studyAccession`` (the manifest STUDY
            entry).
        workingDirectory: folder that receives the files.
        dateTimeNow: text that is concatenated into the file names.

    Returns:
        None. If the flatfile cannot be written or compressed, the problem is
        logged, shown to the user in an info box, and the function returns
        before the manifest is written.
    """
    logging.info('Preparing Submission Files')

    submissionShortFileName = 'HLA_Submission_' + dateTimeNow + '.txt'
    submissionFileName = join(workingDirectory, submissionShortFileName)
    zippedFileName = join(workingDirectory, submissionShortFileName + '.gz')
    manifestFileName = join(workingDirectory,
                            'manifest_' + dateTimeNow + '.txt')

    if not isdir(workingDirectory):
        makedirs(workingDirectory)

    # Create the submission file. The context manager closes the handle even
    # when the write itself fails.
    try:
        with open(submissionFileName, 'w') as outputFileObject:
            outputFileObject.write(submission.enaSubmissionText)
        logging.debug('Submission Text:\n' + submission.enaSubmissionText)

    except Exception as error:
        logging.exception('Cannot Write Submission Flatfile')
        # Report the path that could not be written, not the file contents.
        showInfoBox(
            'Cannot Write Submission Flatfile',
            'Sorry, I failed to create the submission file:\n' +
            str(submissionFileName) +
            '\n and I cannot continue.\nMaybe this is a ' +
            'permissions issue, are these folders read only?\n' +
            str(error))
        logging.error('Failure to create submission file:' + str(error) +
                      '\n')
        return

    logging.info('Submission file was created:\n' + str(submissionFileName) +
                 '\n')

    # gzip the submission file.  Make a gz file.
    try:
        with open(submissionFileName,
                  'rb') as fileIn, gzipOpen(zippedFileName, 'wb') as fileOut:
            copyfileobj(fileIn, fileOut)

    except Exception as error:
        logging.exception('Cannot Compress Submission File')
        showInfoBox(
            'Cannot Compress Submission File',
            'Sorry, I failed to compress the submission file:\n' +
            str(zippedFileName) + '\n and I cannot continue.\n' + str(error))
        logging.error('Failure to create zip file:' + str(error) + '\n')
        return

    logging.info('Zip file was created:\n' + str(zippedFileName) + '\n')

    # Create the Manifest file, which looks like this:
    # NAME    Novel_HLA_Allele_A
    # STUDY   PRJEB22887
    # FLATFILE    ENA.HLA.Submission_A.txt.gz
    #
    # The local allele name doubles as the submission name in the manifest.
    # TODO: Refactor for batch submission; the study accession should not be
    # read from a single shared configuration value there.
    manifestFile = createOutputFile(manifestFileName)
    try:
        manifestFile.write('NAME\t' + str(submission.localAlleleName) + '\n')
        manifestFile.write('STUDY\t' + str(submissionBatch.studyAccession) +
                           '\n')
        manifestFile.write('FLATFILE\t' + str(zippedFileName) + '\n')
    finally:
        # Always release the handle, even when one of the writes fails.
        manifestFile.close()
Esempio n. 12
0
def _createStudyXmlFiles(submissionBatch, workingDirectory, dateTimeNow):
    """Write project.xml and project_submission.xml; return both paths or None on failure."""
    try:
        projectFileName = join(workingDirectory, 'project.xml')
        projectText = createProjectXML(projectFileName,
                                       submissionBatch.studyId,
                                       submissionBatch.studyShortTitle,
                                       submissionBatch.studyAbstract)

        # TODO: All these references to "project" should be changed to
        # "study". The same word is used for both, but "study" is the text on
        # the ENA website.
        logging.debug('Study Text:\n' + projectText)

        projectSubmissionFileName = join(workingDirectory,
                                         'project_submission.xml')
        projectSubmissionText = createProjectSubmissionXML(
            projectSubmissionFileName, 'proj_sub_' + dateTimeNow,
            'project.xml')

        logging.debug('Study Submission Text:\n' + projectSubmissionText)

    except Exception as error:
        logging.exception('Cannot Create Project Submission XML')
        showInfoBox(
            'Cannot Create Project Submission XML',
            'Sorry, I failed to create a project XML file\nand I cannot continue.\n'
            + str(error))
        logging.error('Failure to create project submission file:' +
                      str(error) + '\n')
        return None

    logging.info('Study/Submission XML files were created.\n')
    return projectFileName, projectSubmissionFileName


def _submitNewStudy(submissionBatch, projectFileName,
                    projectSubmissionFileName):
    """Submit the study XML files; return True when a new accession was stored on the batch."""
    try:
        # Return value should be a tuple:
        # (Success, ProjectAccession, Messages[])
        (projectSubmissionSuccess, projectAccessionNumber,
         projectErrorMessages) = performProjectSubmission(
             projectSubmissionFileName, projectFileName, submissionBatch)

        if projectSubmissionSuccess:
            # Use the new study accession moving forward. Any later
            # submissions in this batch go to the same study.
            submissionBatch.studyAccession = projectAccessionNumber
            submissionBatch.chooseStudy = "1"

            successText = (
                'New study has been uploaded, accession:' +
                str(submissionBatch.studyAccession) + '\n' +
                'Subsequent submissions will use this project number.')
            logging.info(successText)
            showInfoBox('Success!', successText)
            return True

        messageText = ('There was a problem in the Study Submission.\n' +
                       'I cannot continue.\n' +
                       'These messages were reported by ENA:\n')
        for errorMessage in projectErrorMessages:
            messageText += ('\n' + errorMessage + '\n')
        showInfoBox('Cannot Submit Study XML via REST', messageText)
        # No exception is being handled on this path, so only the messages
        # reported by ENA are logged.
        logging.error('Failure to submit project submission file:\n' +
                      messageText + '\n')
        return False

    except Exception as error:
        logging.exception('Cannot Submit Study XML')
        showInfoBox(
            'Cannot Submit Study XML',
            'Sorry, I failed to submit the project XML file\nand I cannot continue.\n'
            + str(error))
        logging.error('Failure to upload project submission file:' +
                      str(error) + '\n')
        return False


def _requestExistingStudyAccession(submissionBatch):
    """Ask the user for an existing study accession and store the answer on the batch."""
    logging.info(
        'User asked me to not create a new project. I try to get a new project id from them.'
    )
    submissionBatch.studyAccession = getInfoBox(
        'A Project ID Please.',
        'In that case, please provide a project ID I can use. You can find it on the ENA webin website after you login.\nIt should look like this:PRJEB12345'
    )
    if (submissionBatch.studyAccession is not None
            and len(submissionBatch.studyAccession) > 3):
        submissionBatch.chooseStudy = '1'
        logging.info(
            'User gave me a new project number. I will use this one:' +
            str(submissionBatch.studyAccession))
    else:
        # The batch keeps asking for a new study; leave a trace in the log.
        logging.warning('User did not provide a usable project number:' +
                        str(submissionBatch.studyAccession))


def registerStudy(submissionBatch, workingDirectory, dateTimeNow):
    """Make sure the batch has a study (project) accession to submit to.

    In ENA terms a study and a project are the same thing.

    * When ``submissionBatch.chooseStudy`` is ``'2'`` the user is asked to
      confirm the creation of a new study. After confirmation the project XML
      files are written to ``workingDirectory`` and submitted; on success the
      returned accession is stored in ``submissionBatch.studyAccession`` and
      ``chooseStudy`` is set to ``'1'``. If the user declines, they are asked
      to type an existing accession instead.
    * For any other ``chooseStudy`` value the accession already stored on the
      batch is used unchanged.

    Args:
        submissionBatch: batch whose study fields are read and updated.
        workingDirectory: folder that receives the generated XML files.
        dateTimeNow: text appended to the project submission alias.

    Returns:
        None. The batch is stored under the ``'submission_batch'``
        configuration key, except when creating or submitting the study XML
        fails; those failures are logged and shown to the user, and the
        function returns early.
    """
    logging.info('Registering a Project/Study')

    # The configuration value comes from a radio button in the configuration
    # GUI. existing study = 1, new study = 2
    newProject = (str(submissionBatch.chooseStudy) == '2')

    if not newProject:
        # Existing project, we will use the supplied accession.
        logging.info('Using existing study accession:' +
                     str(submissionBatch.studyAccession) + '\n')

    # Abstract = Description in the ENA vocabulary.
    elif showYesNoBox(
            'Create New Study',
            'Are you sure you want me to create a new Study to store your submission(s)?'
            + '\nID:\n' + str(submissionBatch.studyId) + '\nShort Title:\n' +
            str(submissionBatch.studyShortTitle) + '\nDescription:\n' +
            str(submissionBatch.studyAbstract)):

        studyFiles = _createStudyXmlFiles(submissionBatch, workingDirectory,
                                          dateTimeNow)
        if studyFiles is None:
            return
        projectFileName, projectSubmissionFileName = studyFiles

        if not _submitNewStudy(submissionBatch, projectFileName,
                               projectSubmissionFileName):
            return

    else:
        _requestExistingStudyAccession(submissionBatch)

    assignConfigurationValue('submission_batch', submissionBatch)
Esempio n. 13
0
# (SubmissionBatch attribute, XML attribute) pairs read from the
# <submission_batch> node of the configuration file.
_BATCH_CONFIG_ATTRIBUTES = (
    ('enaUserName', 'enausername'),
    ('studyAccession', 'studyaccession'),
    ('chooseStudy', 'choosestudy'),
    ('ipdSubmitterId', 'ipdsubmitterid'),
    ('ipdSubmitterName', 'ipdsubmittername'),
    ('ipdAltContact', 'ipdaltcontact'),
    ('ipdSubmitterEmail', 'ipdsubmitteremail'),
    ('labOfOrigin', 'laboforigin'),
    ('labContact', 'labcontact'),
    ('studyId', 'studyid'),
    ('studyShortTitle', 'studyshorttitle'),
    ('studyAbstract', 'studyabstract'),
)

# (AlleleSubmission attribute, XML attribute) pairs read from each submission
# node. The attributes that live on submission.submittedAllele and the parsed
# 'typedalleles' value are handled separately in _loadSubmissionFromNode.
_SUBMISSION_CONFIG_ATTRIBUTES = (
    ('localAlleleName', 'localallelename'),
    ('closestAlleleWrittenDescription', 'closestallelewrittendescription'),
    ('ipdSubmissionIdentifier', 'ipdsubmissionidentifier'),
    ('ipdSubmissionVersion', 'ipdsubmissionversion'),
    ('enaAccessionIdentifier', 'enaaccessionidentifier'),
    ('cellId', 'cellid'),
    ('ethnicOrigin', 'ethnicorigin'),
    ('sex', 'sex'),
    ('consanguineous', 'consanguineous'),
    ('homozygous', 'homozygous'),
    ('materialAvailability', 'materialavailability'),
    ('cellBank', 'cellbank'),
    ('primarySequencingMethodology', 'primarysequencingmethodology'),
    ('secondarySequencingMethodology', 'secondarysequencingmethodology'),
    ('primerType', 'primertype'),
    ('primers', 'primers'),
    ('sequencedInIsolation', 'sequencedinisolation'),
    ('sequencingDirection', 'sequencingdirection'),
    ('numOfReactions', 'numofreactions'),
    ('methodComments', 'methodcomments'),
    ('citations', 'citations'),
)


def _loadSubmissionFromNode(submissionNode):
    """Build one AlleleSubmission from a child node of <submission_batch>."""
    attributes = submissionNode.attrib

    submission = AlleleSubmission()
    # The node text holds the sequence; features are identified from it
    # before any of the other values are assigned.
    submission.submittedAllele.rawSequence = submissionNode.text
    submission.submittedAllele.identifyFeaturesFromFormattedSequence()
    submission.submittedAllele.geneLocus = deserializeConfigValue(
        attributes['genelocus'])
    submission.submittedAllele.hlaClass = deserializeConfigValue(
        attributes['class'])

    for attributeName, xmlKey in _SUBMISSION_CONFIG_ATTRIBUTES:
        setattr(submission, attributeName,
                deserializeConfigValue(attributes[xmlKey]))

    # Typed alleles are stored as text and parsed back into an object.
    submission.typedAlleles = parseTypedAlleleInput(
        deserializeConfigValue(attributes['typedalleles']))
    return submission


def _loadSubmissionBatchFromNode(batchNode):
    """Build a SubmissionBatch (including its submissions) from the <submission_batch> node."""
    # If the node has children, start with an empty batch; otherwise start
    # with a single empty submission in the batch. len() is used because
    # truth-testing an Element is deprecated.
    hasSubmissions = len(batchNode) > 0
    submissionBatch = SubmissionBatch(not hasSubmissions)

    for attributeName, xmlKey in _BATCH_CONFIG_ATTRIBUTES:
        setattr(submissionBatch, attributeName,
                deserializeConfigValue(batchNode.attrib[xmlKey]))

    # The children are submission objects. Load up their information.
    for submissionNode in batchNode:
        submissionBatch.submissionBatch.append(
            _loadSubmissionFromNode(submissionNode))

    return submissionBatch


def loadConfigurationFile():
    """Load the XML configuration file, fill in defaults and start logging.

    If the configuration file exists, every child of its root node is stored
    as a configuration value; the ``submission_batch`` node is rebuilt into a
    SubmissionBatch object first. Missing common values are then given
    defaults, the configuration is written back to disk and the log is
    initialised.

    Any exception raised while doing so is logged and shown to the user in an
    info box rather than propagated.
    """
    # TODO: should the configuration be cleared first? Probably not.
    assignConfigName()

    try:
        configFileLocation = globalVariables['config_file_location']

        if not isfile(configFileLocation):
            logging.info(
                'The config file does not exist yet. I will not load it:\n' +
                configFileLocation)
        else:
            logging.info('The config file already exists, I will load it:\n' +
                         configFileLocation)

            root = ET.parse(configFileLocation).getroot()

            for child in root:
                if child.tag == 'submission_batch':
                    submissionBatch = _loadSubmissionBatchFromNode(child)

                    # Store my submission batch in the global variables.
                    assignConfigurationValue('submission_batch',
                                             submissionBatch)

                    logging.debug(
                        'Just loaded the config file and stored the submission batch. Batch is length:'
                        + str(len(submissionBatch.submissionBatch)))
                else:
                    # Any arbitrary configuration value, just store it.
                    assignConfigurationValue(child.tag, child.text)

        # Here is where I assign the common/critical configuration values
        # I do this if the config file already existed, or if it didnt.
        # test_submission indicates if we should use the "test" values.
        assignIfNotExists('test_submission', '1')
        # Log levels are defined in the Saddlebags config, and passed into
        # the python logging module.
        assignIfNotExists('logging', 'DEBUG')
        # Placeholder for proxy configuration. Not needed right now.
        #assignIfNotExists('proxy', None)
        assignIfNotExists(
            'ena_rest_address_test',
            'https://www-test.ebi.ac.uk/ena/submit/drop-box/submit/')
        assignIfNotExists('ena_rest_address_prod',
                          'https://www.ebi.ac.uk/ena/submit/drop-box/submit/')
        assignIfNotExists('nmdp_act_rest_address',
                          'http://act.b12x.org/annotate')
        assignIfNotExists('webin_jar_location', 'webin-cli.jar')
        assignIfNotExists('submission_batch', SubmissionBatch(True))

        writeConfigurationFile()

        # The log is initialised last because it needs some configuration
        # values before it can start.
        initializeLog()
    except Exception as error:
        # Exception (not a bare except) so Ctrl-C and interpreter exit are
        # not swallowed by this handler.
        errorText = (
            'Error when loading configuration file:' +
            str(globalVariables['config_file_location']) +
            '.\nTry deleting your configuration file and reload Saddlebags.\n'
            + str(error))
        logging.error(errorText)
        showInfoBox('Error Loading Configuration', errorText)