def download_file(cls, id_, filetype):
    """Download a WGS gzip archive over FTP and unpack it into a FASTA file.

    Lists the WGS directory on the bio-mirror FTP server, selects the single
    entry whose name contains both ``id_`` and ``filetype``, stores it as
    ``tmp/loading.gz`` and writes its decompressed contents to
    ``tmp/loading.fasta`` (both paths resolved through ``generate_path``)
    for further processing.

    Args:
        id_(str): a WGS project ID, composed of only alphabetics
        filetype(str): the type of file to be found, e.g. 'fsa_nt.gz';
            there are other options for amino acids and other formats

    Raises:
        TypeError: if the number of files on the server matching both
            ``id_`` and ``filetype`` is not exactly one.
    """
    ftp = FTP('bio-mirror.jp.apan.net')
    try:
        ftp.login('anonymous', '*****@*****.**')
        ftp.cwd('pub/biomirror/genbank/wgs')

        matches = [s for s in ftp.nlst() if id_ in s and filetype in s]
        # Compare by value; an identity test against an int literal only
        # works by accident of small-integer caching.
        if len(matches) != 1:
            raise TypeError("No files could be found for download.")

        # Close the archive before it is reopened for decompression so
        # that every downloaded byte is flushed to disk.
        with open(generate_path('tmp/loading.gz'), 'wb') as archive:
            ftp.retrbinary('RETR ' + matches[0], archive.write)
    finally:
        # Release the connection whether or not the download succeeded.
        ftp.close()

    with gzip.open(generate_path('tmp/loading.gz')) as fasta, \
            open(generate_path('tmp/loading.fasta'), 'wb') as output:
        output.write(fasta.read())
def upload_genes(self):
    """Load the gene JSON file and pass it to the parser matching self.kind.

    Reads ``self.filename`` (resolved through ``generate_path``) as JSON.
    The data goes to ``parse_vf`` when ``self.kind`` is "virulence_factor"
    and to ``parse_amr`` when it is "antimicrobial_resistance"; for any
    other kind the file is loaded but nothing is parsed.
    """
    with open(generate_path(self.filename), "r") as json_file:
        gene_data = json.load(json_file)

    if self.kind == "virulence_factor":
        self.parse_vf(gene_data)
        return
    if self.kind == "antimicrobial_resistance":
        self.parse_amr(gene_data)
def validate(self):
    """Handles the whole sequence validation process.

    Runs ``filter_passing_hits`` and then every individual check. A check
    counts as failed only when its result is exactly ``False``. The
    outcome is recorded on ``self.seqdata.valid``; each failed check is
    also appended as one line to ``outputs/seq_errors.txt``.

    TODO: refactor this more for clarity and ease of testing?
    """
    self.filter_passing_hits()

    checks = {
        "number of hits": self.check_hits(),
        "base pair count": self.check_bp(),
        "contig count": self.check_contigs(),
        "characters": self.check_chars(),
        # The stored value is the negation of check_checksum's result.
        "checksum": not check_checksum(self.seqdata.checksum),
    }
    failed_names = [
        name for name, passed in checks.items() if passed is False
    ]

    if not failed_names:
        self.seqdata.valid = True
        return

    # TODO: replace this with a logger; the failure would then become a
    # raised Exception that is caught by the Sequence_Upload code.
    # Open the error log once for all failures instead of once per check.
    with open(generate_path("outputs/seq_errors.txt"), "a") as file_:
        for name in failed_names:
            file_.write(
                '%s failed validation:'
                'the %s was not valid\n' % (self.seqdata.accession, name)
            )
    self.seqdata.valid = False
def blastn_commandline(cls):
    """Run command line blastn on the generated FASTA file.

    Queries ``tmp/validate.fasta`` against the ``data/blast/ValidationDB``
    database and writes the results in XML format (``-outfmt 5``) to
    ``tmp/validate.xml``. The exit status of the command is not checked.
    """
    blastn_template = (
        '%s -query %s -db %s -outfmt 5 -out %s -best_hit_score_edge 0.05 '
        '-best_hit_overhang 0.1'
    )
    # Order matches the placeholders: executable, query, database, output.
    arguments = (
        generate_path("../../blast/ncbi-blast*/bin/blastn"),
        generate_path("tmp/validate.fasta"),
        generate_path("data/blast/ValidationDB"),
        generate_path("tmp/validate.xml"),
    )
    # Runs through the shell; presumably so that the ``ncbi-blast*``
    # wildcard in the executable path gets expanded.
    subprocess.call(blastn_template % arguments, shell=True)
def filter_passing_hits(self):
    """Run BLAST on the sequence and keep the hits scoring 90% or above.

    Writes the query FASTA, runs the command line BLAST, then parses
    ``tmp/validate.xml``. For every alignment the score is the
    ``positives`` count of its first HSP divided by the alignment's
    ``length`` attribute, as a percentage. Alignments scoring between 90
    and 100 inclusive are kept; when the same hit definition appears more
    than once, the highest score wins. The resulting mapping of hit
    definition to score is stored on ``self.seqdata.hits``.
    """
    self.create_fasta()
    self.blastn_commandline()

    hits = {}
    # The context manager guarantees the results file is closed, even if
    # parsing raises part-way through.
    with open(generate_path("tmp/validate.xml")) as result_handle:
        for record in NCBIXML.parse(result_handle):
            for entry in record.alignments:
                hsp = entry.hsps[0]
                percent_ident = (
                    float(hsp.positives) / float(entry.length)
                ) * 100
                if 90 <= percent_ident <= 100:
                    hit = entry.hit_def
                    # Keep the best score seen so far for this hit.
                    hits[hit] = max(
                        percent_ident, hits.get(hit, percent_ident)
                    )

    self.seqdata.hits = hits
def create_fasta(self):
    """Write the sequence data to a FASTA file for command line BLAST.

    Each entry of ``self.seqdata.contigs`` becomes its own FASTA record in
    ``tmp/validate.fasta``, so contigs from WGS samples stay separate and
    cannot produce false matches from misaligned sequences.
    """
    with open(generate_path("tmp/validate.fasta"), "w") as fasta_out:
        # NOTE(review): the per-contig name is unpacked but unused; every
        # record is headed by the sequence-level accession. Confirm that
        # this is intended.
        for _contig_name, nucleotides in self.seqdata.contigs:
            header = self.seqdata.accession
            fasta_out.write(">%s\n%s\n" % (header, nucleotides))
def test_generate_path(self):
    """generate_path("asdf") yields "asdf" inside this file's directory."""
    # Entry 0 of the stack is the current frame; index 1 is its filename.
    this_file = inspect.stack()[0][1]
    expected_dir = os.path.dirname(this_file)

    result = _utils.generate_path("asdf")

    self.assertEqual(expected_dir, os.path.dirname(result))
    self.assertEqual(os.path.basename(result), "asdf")
def upload(self):
    """Stream the metadata file through ijson and parse it.

    Opens ``self.filename`` (resolved through ``generate_path``), feeds the
    ijson event stream to ``parse_metadata`` and finally prints a summary
    with the number of genomes processed and errors encountered.
    """
    with open(generate_path(self.filename), "r") as json_stream:
        # The ijson stream has to be consumed while the file is still open.
        self.parse_metadata(ijson.parse(json_stream))

    summary = "%d genomes parsed, %d errors occurred." % (
        self.progress, self.error
    )
    print(summary)
def setup_curated_data(self):
    """Build the curated-data turtle file for uploading into Blazegraph.

    Runs every conversion step in order and then writes ``self.graph`` to
    ``ontologies/setup.ttl``.
    """
    conversion_steps = (
        self.convert_host_categories,
        self.convert_hosts,
        self.convert_microbes,
        self.convert_sources,
        self.convert_syndromes,
        self.generate_serotypes,
    )
    for run_step in conversion_steps:
        run_step()

    output_path = generate_path('ontologies/setup.ttl')
    generate_file_output(self.graph, output_path)
def error_logging(self, name):
    """Append the current traceback to the error log and count the error.

    Increments ``self.error``, writes ``name`` together with the formatted
    traceback to ``outputs/errors.txt`` and prints the running error count.

    Args:
        name(str): The genome that is currently being uploaded
    """
    self.error += 1

    entry_template = (
        "%s \n\n %s \n "
        "================================ \n\n"
    )
    with open(generate_path("outputs/errors.txt"), "a") as log_file:
        log_file.write(entry_template % (name, traceback.format_exc()))

    print("Error %d occurred." % self.error)
def from_ftp(self, seqdata):
    """Load the FASTA sequence through the FTP download and mark it as WGS.

    Downloads the 'fsa_nt.gz' file for the accession (reduced to its
    alphabetic characters), reads the unpacked ``tmp/loading.fasta`` with
    ``read_fasta`` and labels the sequence as coming from the WGS pipeline.

    Args:
        seqdata: a SequenceMetadata instance storing sequence-related data
            that would otherwise be a data clump
    """
    project_id = strip_non_alphabetic(str(seqdata.accession))
    self.download_file(project_id, 'fsa_nt.gz')

    fasta_path = generate_path('tmp/loading.fasta')
    with open(fasta_path, 'rb') as fasta_file:
        self.read_fasta(fasta_file, seqdata)

    seqdata.dict["is_from"] = "WGS"
def get_seqdata(self, contigswrapper):
    """Fetch the sequence for a genome and pass it to self.load_contigs.

    The genome is first requested from Entrez (nuccore, FASTA). If any
    returned record describes itself as "complete", the genome is labelled
    "CORE" and loaded from a fresh Entrez response. Otherwise the WGS file
    is downloaded through ``download_file``, the genome is labelled "WGS"
    and it is loaded from ``tmp/loading.fasta``. The WGS fallback runs
    once, when no record is marked complete. The whole attempt is tried up
    to three times; an attempt is repeated when an HTTPError occurs.

    Args:
        contigswrapper: a ContigsWrapper instance that holds contig
            metadata for a genome

    Returns:
        None. The data is delivered through ``self.load_contigs``.

    Raises:
        TypeError: if every attempt ended in an HTTPError.
    """
    Entrez.email = "*****@*****.**"
    max_attempts = 3

    for _attempt in range(max_attempts):
        try:
            print("Getting data from Entrez...")
            handle = Entrez.efetch(
                db="nuccore",
                id=contigswrapper.genome,
                rettype="fasta",
                retmode="text"
            )
            # any() stops at the first record marked complete.
            is_complete = any(
                "complete" in record.description.lower()
                for record in SeqIO.parse(handle, 'fasta')
            )

            if is_complete:
                contigswrapper.dict["is_from"] = "CORE"
                # The first response was read while scanning the
                # descriptions, so the data is requested again for loading.
                print("Getting data from Entrez...")
                handle = Entrez.efetch(
                    db="nuccore",
                    id=contigswrapper.genome,
                    rettype="fasta",
                    retmode="text"
                )
                self.load_contigs(handle, contigswrapper)
            else:
                print("Downloading data from WGS")
                self.download_file(
                    strip_non_alphabetic(str(contigswrapper.genome)),
                    'fsa_nt.gz'
                )
                with open(
                    generate_path('tmp/loading.fasta'), 'rb'
                ) as handle:
                    contigswrapper.dict["is_from"] = "WGS"
                    self.load_contigs(handle, contigswrapper)
        except HTTPError:
            continue
        # The attempt finished without an HTTP failure; nothing to retry.
        return

    # Reached only when all attempts failed. Report it explicitly instead
    # of returning as if the data had been loaded.
    raise TypeError("Could not retrieve file for analysis")
def upload_all_ontologies(cls):
    """Uploads all ontologies in the ``ontologies`` folder.

    The format of the ontology is automatically interpreted by Blazegraph
    based on the file extension. If any format fails, it is probably
    because of an extension mismatch (for example, Turtle files are not
    .owl, as the W3C standardized file format for RDF and OWL is RDF/XML).

    Every directory entry is passed to ``file_update`` after its name has
    been printed.
    """
    folder = generate_path("ontologies")
    for file_ in os.listdir(folder):
        path = os.path.join(folder, file_)
        # Report the entry being imported (``file_``), not the ``file``
        # builtin.
        print("importing %s" % file_)
        file_update(path)
def import_json(cls, filename):
    """Imports JSON data from the specified file into Python.

    Args:
        filename (str): the filepath relative to the directory of this
            python module

    Returns:
        a Python object composed of the data from the JSON data
    """
    path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        filename
    )
    # The file is only read, so it is opened read-only; "r+" would fail
    # on files without write permission.
    with open(generate_path(path), "r") as file_:
        return json.load(file_)
def error_logging(cls, contigswrapper):
    """Log a contig uploading error to a file, for manual curation.

    Appends the genome, the accession and the current traceback to
    ``outputs/seq_errors.txt`` and prints a short notice.

    Args:
        contigswrapper: a ContigsWrapper instance storing sequence-related
            data that would otherwise be a data clump
    """
    with open(generate_path("outputs/seq_errors.txt"), "a") as log_file:
        header = "Genome: %s - Accession: %s.\n" % (
            contigswrapper.genome, contigswrapper.accession
        )
        log_file.write(header)
        trace = "%s \n ================================ \n\n" % (
            traceback.format_exc()
        )
        log_file.write(trace)

    notice = "%s - %s: The records for this sequence are not retrievable." % (
        contigswrapper.genome, contigswrapper.accession
    )
    print(notice)