Beispiel #1
0
    def download_file(cls, id_, filetype):
        """
        Download the gzipped WGS file matching an ID and filetype, then
        decompress it into a temporary FASTA file for further processing.

        The archive is saved to 'tmp/loading.gz' and its decompressed contents
        are written to 'tmp/loading.fasta' (both resolved via generate_path).

        Args:
            id_(str): a WGS project ID, composed of only alphabetics
            filetype(str): the type of file to be found, e.g. 'fsa_nt.gz';
                there are other options for amino acids and other formats

        Raises:
            TypeError: if the server listing does not contain exactly one
                file whose name includes both id_ and filetype.
        """
        archive_path = generate_path('tmp/loading.gz')
        fasta_path = generate_path('tmp/loading.fasta')

        ftp = FTP('bio-mirror.jp.apan.net')
        try:
            ftp.login('anonymous', '*****@*****.**')
            ftp.cwd('pub/biomirror/genbank/wgs')

            matches = [
                name for name in ftp.nlst()
                if id_ in name and filetype in name
            ]
            # Compare by value: `is not 1` tests object identity and only
            # works by accident of CPython's small-integer caching.
            if len(matches) != 1:
                raise TypeError("No files could be found for download.")

            # The context manager guarantees the archive is flushed and
            # closed before it is reopened for decompression below.
            with open(archive_path, 'wb') as archive:
                ftp.retrbinary('RETR ' + matches[0], archive.write)
        finally:
            # Always release the connection, including on the error path.
            ftp.close()

        with gzip.open(archive_path) as fasta, \
                open(fasta_path, 'wb') as output:
            output.write(fasta.read())
Beispiel #2
0
 def upload_genes(self):
     """Load the JSON gene file and pass it to the parser matching self.kind.

     "virulence_factor" data goes to self.parse_vf and
     "antimicrobial_resistance" data goes to self.parse_amr. Any other kind
     is loaded but not parsed.
     """
     source_path = generate_path(self.filename)
     with open(source_path, "r") as json_file:
         genes = json.load(json_file)
         # Parsing happens while the file is still open, as before.
         if self.kind == "virulence_factor":
             self.parse_vf(genes)
         elif self.kind == "antimicrobial_resistance":
             self.parse_amr(genes)
    def validate(self):
        """Handle the whole sequence validation process.

        Filters the BLAST hits, runs every check, and records the verdict on
        the associated sequence data object: self.seqdata.valid is True when
        no check failed and False otherwise. Each failed check appends one
        line to 'outputs/seq_errors.txt'.

        TODO: refactor this more for clarity and ease of testing?
        """
        self.filter_passing_hits()

        checks = {
            "number of hits": self.check_hits(),
            "base pair count": self.check_bp(),
            "contig count": self.check_contigs(),
            "characters": self.check_chars(),
            # This check passes when check_checksum returns a falsy value.
            "checksum": not check_checksum(self.seqdata.checksum),
        }

        # Only an explicit False counts as a failure; other falsy results
        # (e.g. None) are deliberately not treated as failed.
        failed_checks = [
            name for name, passed in checks.items() if passed is False
        ]

        if failed_checks:
            # TODO: replace this with a logger; the failure would become a
            # raised Exception caught by the Sequence_Upload code.
            # The log is opened once for all failures rather than once per
            # failed check.
            with open(generate_path("outputs/seq_errors.txt"), "a") as file_:
                for name in failed_checks:
                    file_.write(
                        '%s failed validation:'
                        'the %s was not valid\n' % (
                            self.seqdata.accession, name
                        )
                    )

        self.seqdata.valid = not failed_checks
    def blastn_commandline(cls):
        """Run command line blastn on the generated FASTA file.

        Queries 'tmp/validate.fasta' against the validation database (composed
        of 10 E. coli species-specific genomic regions) and writes the results
        in XML format (-outfmt 5) to 'tmp/validate.xml'.
        """
        blastn = generate_path("../../blast/ncbi-blast*/bin/blastn")
        query = generate_path("tmp/validate.fasta")
        database = generate_path("data/blast/ValidationDB")
        xml_out = generate_path("tmp/validate.xml")

        template = (
            '%s -query %s -db %s -outfmt 5 -out %s -best_hit_score_edge 0.05 '
            '-best_hit_overhang 0.1'
        )
        invocation = template % (blastn, query, database, xml_out)

        # NOTE(review): the blastn path contains a '*' wildcard, so this
        # presumably relies on the shell to expand it -- keep shell=True.
        subprocess.call(invocation, shell=True)
    def filter_passing_hits(self):
        """
        Run BLAST on the sequence and keep the best passing score per hit.

        Writes the query FASTA, runs the command line BLAST, then parses
        'tmp/validate.xml'. For each alignment the score is the first HSP's
        positives divided by the alignment's length, as a percentage. Scores
        from 90 to 100 inclusive are kept; when the same hit definition
        appears more than once, the highest score wins. The resulting
        {hit definition: percentage} dict is stored in self.seqdata.hits.
        """
        self.create_fasta()
        self.blastn_commandline()

        hits = {}
        # The context manager closes the result file deterministically, even
        # if parsing raises.
        with open(generate_path("tmp/validate.xml")) as result_handle:
            for record in NCBIXML.parse(result_handle):
                for entry in record.alignments:
                    # Only the first HSP of each alignment is scored.
                    hsp = entry.hsps[0]
                    percent_ident = (
                        float(hsp.positives) / float(entry.length)
                    ) * 100

                    if not 90 <= percent_ident <= 100:
                        continue

                    hit = entry.hit_def
                    # Passing scores are >= 90, so a default of 0 always
                    # admits the first score seen for a hit.
                    if percent_ident > hits.get(hit, 0):
                        hits[hit] = percent_ident
        self.seqdata.hits = hits
 def create_fasta(self):
     """Write the contigs to 'tmp/validate.fasta' for command line BLAST.

     Every contig in self.seqdata.contigs becomes its own FASTA record so
     that contigs from WGS samples stay separate and do not produce false
     matches from misaligned sequences. Each record header is
     self.seqdata.accession; the per-contig name is not written.
     """
     fasta_path = generate_path("tmp/validate.fasta")
     with open(fasta_path, "w") as fasta_file:
         for _contig_name, sequence in self.seqdata.contigs:
             record = ">%s\n%s\n" % (self.seqdata.accession, sequence)
             fasta_file.write(record)
Beispiel #7
0
    def test_generate_path(self):
        # Element 1 of the current frame record is the path of this file.
        this_file = inspect.stack()[0][1]
        here = os.path.dirname(this_file)

        produced = _utils.generate_path("asdf")

        # The generated path must sit in the same directory as this file ...
        self.assertEqual(here, os.path.dirname(produced))
        # ... and keep the requested file name unchanged.
        self.assertEqual(os.path.basename(produced), "asdf")
Beispiel #8
0
    def upload(self):
        """Parse the configured file as an ijson stream and upload it.

        The event stream from ijson.parse is handed to self.parse_metadata
        while the file is open. Afterwards a summary is printed with the
        number of genomes processed (self.progress) and the number of errors
        encountered (self.error).
        """
        source_path = generate_path(self.filename)
        with open(source_path, "r") as stream:
            events = ijson.parse(stream)
            self.parse_metadata(events)

        summary = "%d genomes parsed, %d errors occurred." % (
            self.progress, self.error
        )
        print(summary)
Beispiel #9
0
 def setup_curated_data(self):
     """
     Convert all curated data stored in JSON format into a turtle file
     ready for uploading into Blazegraph.

     Runs each conversion step on this instance in a fixed order, then
     passes self.graph and the 'ontologies/setup.ttl' path to
     generate_file_output.
     """
     steps = (
         "convert_host_categories",
         "convert_hosts",
         "convert_microbes",
         "convert_sources",
         "convert_syndromes",
         "generate_serotypes",
     )
     # Each step is looked up only when its turn comes, so the call order
     # matches a plain sequence of method calls.
     for step in steps:
         getattr(self, step)()

     graph = self.graph
     output_path = generate_path('ontologies/setup.ttl')
     generate_file_output(graph, output_path)
Beispiel #10
0
    def error_logging(self, name):
        """Append the current traceback to the error log and count the error.

        Increments self.error, appends the genome name and the formatted
        traceback to 'outputs/errors.txt', then prints the running error
        count.

        Args:
            name(str): The genome that is currently being uploaded
        """
        self.error += 1

        log_path = generate_path("outputs/errors.txt")
        with open(log_path, "a") as log:
            entry = "%s \n\n %s \n ================================ \n\n" % (
                name, traceback.format_exc()
            )
            log.write(entry)

        print("Error %d occurred." % self.error)
Beispiel #11
0
    def from_ftp(self, seqdata):
        """Fetch the FASTA sequence over FTP and mark it as WGS-derived.

        Downloads the 'fsa_nt.gz' file for the alphabetic part of the
        accession, reads the resulting 'tmp/loading.fasta' via
        self.read_fasta, and labels the sequence as coming from the WGS
        pipeline.

        Args:
            seqdata: a SequenceMetadata instance storing sequence-related data
            that would otherwise be a data clump
        """
        project_id = strip_non_alphabetic(str(seqdata.accession))
        self.download_file(project_id, 'fsa_nt.gz')

        fasta_path = generate_path('tmp/loading.fasta')
        with open(fasta_path, 'rb') as fasta_file:
            self.read_fasta(fasta_file, seqdata)

        seqdata.dict["is_from"] = "WGS"
Beispiel #12
0
    def get_seqdata(self, contigswrapper):
        """
        Retrieve the sequence for a genome and load its contigs.

        Fetches the genome from Entrez (nuccore, FASTA). A record whose
        description contains "complete" is labelled "CORE" and loaded from a
        fresh Entrez fetch; any other record is labelled "WGS" and loaded
        from the file obtained through self.download_file. The fetch is
        attempted up to three times when an HTTPError occurs.

        Args:
            contigswrapper: a ContigsWrapper instance that holds contig
            metadata for a genome

        Returns: None; results are recorded on contigswrapper through
        self.load_contigs and contigswrapper.dict["is_from"].

        Raises:
            TypeError: if all three attempts fail with an HTTPError.
        """
        Entrez.email = "*****@*****.**"

        for _attempt in range(3):
            try:
                print("Getting data from Entrez...")
                handle = Entrez.efetch(
                    db="nuccore", id=contigswrapper.genome,
                    rettype="fasta", retmode="text"
                )
                for record in SeqIO.parse(handle, 'fasta'):
                    if "complete" in record.description.lower():
                        contigswrapper.dict["is_from"] = "CORE"
                        # The first handle is already being read by the
                        # parser above, so a fresh one is fetched for
                        # loading.
                        print("Getting data from Entrez...")
                        handle = Entrez.efetch(
                            db="nuccore", id=contigswrapper.genome,
                            rettype="fasta", retmode="text"
                        )
                        self.load_contigs(handle, contigswrapper)
                        break

                    # NOTE(review): there is no break on this path, so the
                    # WGS download repeats for every non-"complete" record.
                    # Confirm whether that is intended.
                    print("Downloading data from WGS")
                    self.download_file(
                        strip_non_alphabetic(str(contigswrapper.genome)),
                        'fsa_nt.gz'
                    )
                    with open(
                        generate_path('tmp/loading.fasta'),
                        'rb'
                    ) as fasta_file:
                        contigswrapper.dict["is_from"] = "WGS"
                        self.load_contigs(fasta_file, contigswrapper)
            except HTTPError:
                continue
            break
        else:
            # Every attempt ended in an HTTPError. The previous check relied
            # on a NameError that could never occur, so this failure used to
            # pass silently.
            raise TypeError("Could not retrieve file for analysis")
Beispiel #13
0
    def upload_all_ontologies(cls):
        """
        Uploads all ontologies in the 'ontologies' folder.

        Every entry listed in the folder is passed to file_update.

        The format of the ontology is automatically interpreted by Blazegraph
        based on the file extension. If any format fails, it is probably
        because of an extension mismatch (for example, Turtle files are not
        .owl, as the W3C standardized file format for RDF and OWL is RDF/XML).

        """
        folder = generate_path("ontologies")
        for file_ in os.listdir(folder):
            path = os.path.join(folder, file_)
            # Report the entry being imported; the message previously
            # formatted the builtin `file` instead of the loop variable.
            print("importing %s" % file_)
            file_update(path)
Beispiel #14
0
    def import_json(cls, filename):
        """
        Imports JSON data from the specified file into Python

        Args:
            filename (str): the path of the JSON file, relative to the
                directory containing this module

        Returns: a Python object composed of the data from the JSON data

        """
        path = os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            filename
        )
        # Read-only mode: the file is only parsed, and "r+" would fail on
        # files without write permission.
        with open(generate_path(path), "r") as file_:
            return json.load(file_)
Beispiel #15
0
    def error_logging(cls, contigswrapper):
        """
        Record a contig upload failure for manual curation.

        Appends the genome, accession and current traceback to
        'outputs/seq_errors.txt', then prints a notice that the records for
        the sequence are not retrievable.

        Args:
            contigswrapper: a ContigsWrapper instance storing sequence-related
            data that would otherwise be a data clump
        """
        log_path = generate_path("outputs/seq_errors.txt")
        with open(log_path, "a") as log:
            identity = (contigswrapper.genome, contigswrapper.accession)
            log.write("Genome: %s - Accession: %s.\n" % identity)
            log.write(
                "%s \n ================================ \n\n"
                % (traceback.format_exc())
            )

        notice = "%s - %s: The records for this sequence are not retrievable."
        print(notice % (contigswrapper.genome, contigswrapper.accession))