def gff_view(params):
    """
    Converts data to GFF.
    """
    print("##gff-version 3")

    for param in params:

        # Stop when the data was not found.
        if not param.json:
            utils.error(f"data not found: {param.name}")

        # Each data may have multiple entries.
        for item in param.json:

            # Pull out the features.
            feats = item[const.FEATURES]

            # The name of the GFF anchor.
            anchor = param.seqid or item['id']

            # Subselect by coordinates.
            feats = jsonrec.filter_features(feats, start=param.start, end=param.end,
                                            gene=param.gene, ftype=param.type,
                                            regexp=param.regexp)

            # Generate the GFF output.
            for feat in feats:
                values = feature2gff(feat, anchor=anchor)
                values = map(str, values)
                print("\t".join(values))

def genome(name, fname, update=False, genbank={}, refseq={},
           summary=ASSEMBLY_FILE_NAME, jsondb=ASSEMBLY_JSON_DB):
    """
    Parses and searches an assembly summary file for an accession number.
    """
    # Update the assembly information if it is missing.
    if not os.path.isfile(summary):
        update = True

    # When update is true get the assembly summary file again.
    if update:
        logger.info("updating assembly summary")
        download_assembly()

    if not os.path.isfile(jsondb):
        utils.error("json db needs to be built")

    urlpath = genbank.get(name) or refseq.get(name)

    # Download the data when the accession resolved to a URL.
    if urlpath:
        download_file(url=urlpath, dest=fname)
    else:
        # If we got this far we have not found the data.
        print(f'*** accession not found: {name}')

def search(term, db='sra', tabular=False, limit=None):
    limit = 10000 if not limit else limit

    env = entrez.esearch(db=db, term=term, usehistory="y")
    data = entrez.efetch(db=db, env=env, retmax=limit, rettype="runinfo")

    elems = data.get('SraRunInfo', {}).get("Row", {})

    if not elems:
        utils.error("the SRA query returned no results")

    if tabular and elems:
        fieldnames = elems[0].keys()
        writer = csv.DictWriter(sys.stdout, delimiter="\t", fieldnames=fieldnames)
        writer.writeheader()
        for row in elems:
            writer.writerow(row)
    else:
        pprint(elems)

    return data

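# Illustrative sketch (not part of the original module): the tabular branch in
# search() relies on csv.DictWriter to print a list of dictionaries as
# tab-delimited text. The rows below are made-up placeholders, not real SRA
# runinfo fields.
def _demo_tabular_output():
    import csv, sys

    rows = [
        {"Run": "SRR0000001", "spots": "1000"},
        {"Run": "SRR0000002", "spots": "2000"},
    ]
    writer = csv.DictWriter(sys.stdout, delimiter="\t", fieldnames=rows[0].keys())
    writer.writeheader()
    writer.writerows(rows)
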
def gff_view(params):
    """
    Converts data to GFF.
    """
    print("##gff-version 3")

    for param in params:

        # GFF is an interval format (record mode).
        param.record = True

        # Stop when the data was not found.
        if not param.json:
            utils.error(f"data not found: {param.acc}")

        # Each data may have multiple entries.
        for item in param.json:

            # Pull out the features.
            feats = jsonrec.get_json_features(item)

            # The name of the GFF anchor.
            anchor = param.seqid or item['id']

            # Subselect by coordinates.
            feats = jsonrec.filter_features(feats, param=param)

            # Generate the GFF output.
            for feat in feats:
                for values in feature2gff(feat, anchor=anchor, allow_parent=not param.type):
                    values = map(str, values)
                    print("\t".join(values))

def parse_file(fname, seqid=None):
    """
    Parses a recognized file into a JSON representation.
    """
    logger.info(f"parsing {fname}")

    if not os.path.exists(fname):
        logger.warning(f"file does not exist: {fname}")
        return

    # Handle both compressed and uncompressed formats.
    stream = gzip.open(fname, 'rt') if fname.endswith(".gz") else open(fname, 'rt')

    # Detect the extension.
    name, ext = os.path.splitext(fname)
    ext = ext.lower()

    # Split the extension one more time if it looks like a compressed file.
    if ext == ".gz":
        name, ext = os.path.splitext(name)
        ext = ext.lower()

    # Cascade over the known file formats.
    if ext in (".gb", ".gbk", ".genbank"):
        recs = SeqIO.parse(stream, format=const.GENBANK)
        data = convert_genbank(recs, seqid=seqid)
    elif ext in (".fa", ".fasta"):
        recs = SeqIO.parse(stream, format=const.FASTA)
        data = convert_fasta(recs, seqid=seqid)
    else:
        utils.error(f"file format not recognized: {fname}")

    return data

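# Illustrative sketch (not part of the original module): parse_file() opens
# plain or gzip-compressed files in text mode and hands the stream to
# Biopython's SeqIO. A minimal, self-contained version of that pattern,
# assuming Biopython is installed and `fname` points to an existing FASTA or
# GenBank file.
def _demo_parse(fname):
    import gzip, os
    from Bio import SeqIO

    # Open compressed or plain files uniformly in text mode.
    stream = gzip.open(fname, 'rt') if fname.endswith(".gz") else open(fname, 'rt')

    # Pick the Biopython format name from the extension (ignoring a .gz suffix).
    base = fname[:-3] if fname.endswith(".gz") else fname
    ext = os.path.splitext(base)[1].lower()
    fmt = "genbank" if ext in (".gb", ".gbk", ".genbank") else "fasta"

    # Return the parsed records.
    return list(SeqIO.parse(stream, fmt))
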
def parse_data(fname, study_size=10):
    """
    Takes a .gaf file and returns an association dictionary and a population set.
    """
    if not os.path.isfile(fname):
        utils.error("association file needs to be downloaded first")

    # Collect the association and the population from the file.
    association = {}
    population = set()

    stream = utils.gz_read(fname, 'r')

    print(f"*** parsing {fname}")

    # Get the gene and GO term from each row.
    for line in stream:
        line = line.decode()
        if line.startswith('!'):
            continue
        gene = line.split('\t')[2]
        goterm = line.split('\t')[4]
        association.setdefault(gene, set()).update([goterm])
        population.update([gene])

    return population, association

def search_names(word, archive=TAXDB_NAME, name="names.dmp", limit=None):
    """
    Processes the names.dmp component of the taxdump.
    """
    # Needs a taxdump to work.
    if not os.path.isfile(archive):
        utils.error("taxdump file not found (download and build it first)")

    # Open a stream into the tarfile.
    stream = open_tarfile(archive=archive, filename=name, limit=limit)

    # The pattern may be a regular expression.
    patt = re.compile(word, re.IGNORECASE)

    # Labels that will be searched.
    valid = {'scientific name', 'equivalent name', 'genbank common name'}

    def select(row):
        taxid, name, label = row[0], row[2], row[6]
        return label in valid and patt.search(name)

    # Apply the selector.
    stream = filter(select, stream)

    for elems in stream:
        taxid, name, label = elems[0], elems[2], elems[6]
        yield taxid, name

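# Illustrative sketch (not part of the original module): search_names() filters
# tab-delimited rows with a compiled, case-insensitive regular expression. The
# same pattern in a self-contained form, using made-up rows shaped like
# names.dmp columns (taxid at index 0, name at index 2, label at index 6).
def _demo_name_filter(word="sapiens"):
    import re

    rows = [
        ["9606", "|", "Homo sapiens", "|", "", "|", "scientific name"],
        ["10090", "|", "Mus musculus", "|", "", "|", "scientific name"],
    ]
    patt = re.compile(word, re.IGNORECASE)
    valid = {"scientific name", "equivalent name", "genbank common name"}

    # Keep only rows whose label is valid and whose name matches the pattern.
    hits = [(row[0], row[2]) for row in rows if row[6] in valid and patt.search(row[2])]
    return hits
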
def json_view(params):
    """
    Prints the JSON representation of the data.
    """
    for param in params:

        # Stop when the data was not found.
        if not param.json:
            utils.error(f"data not found: {param.name}")

        # Produce the full file when no filtering parameters are set.
        if param.unset():
            text = json.dumps(param.json, indent=4)
            print(text)
        else:
            # Select individual features.
            for item in param.json:
                feats = item[const.FEATURES]
                feats = filter_features(feats, start=param.start, end=param.end,
                                        ftype=param.type, gene=param.gene,
                                        regexp=param.regexp)
                text = json.dumps(list(feats), indent=4)
                print(text)

def run(start=1, end='', mode=LOCAL_ALIGN, gap_open=11, gap_extend=1,
        protein=False, translate=False, inter=False, verbose=False,
        query='', target=''):
    """
    Handles an alignment request.
    """
    # Set the verbosity of the process.
    utils.set_verbosity(logger, level=int(verbose))

    # Ensure the counter is reset.
    jsonrec.reset_counter()

    # Requires two inputs.
    if not (query and target):
        utils.error("Please specify both a QUERY and a TARGET")

    param1 = objects.Param(name=query, protein=protein, translate=translate,
                           start=start, end=end, gap_open=gap_open,
                           gap_extend=gap_extend, mode=mode)

    param2 = objects.Param(name=target, protein=protein, translate=translate,
                           start=start, end=end, gap_open=gap_open,
                           gap_extend=gap_extend, mode=mode)

    # Get the JSON data.
    param1.json = storage.get_json(param1.name, inter=inter, strict=True)
    param2.json = storage.get_json(param2.name, inter=inter, strict=True)

    for rec1 in param1.json:
        for rec2 in param2.json:
            qrecs = fastarec.get_fasta(rec1, param=param1)
            trecs = fastarec.get_fasta(rec2, param=param2)
            for qseq in qrecs:
                for tseq in trecs:
                    parasail_align(qseq=qseq, tseq=tseq, param=param1)

def get_data(preload=False):
    if preload:
        if not os.path.isfile(JSON_DB):
            utils.error(f"ontology file not found (you must build it first): {JSON_DB}")
        store = json.load(open(JSON_DB))
        terms = store[TERM]
    else:
        terms = open_db(TERM)

    return terms

def get_data(preload=False):
    if preload:
        if not os.path.isfile(JSON_DB):
            utils.error(f"taxonomy file not found (you must build it first): {JSON_DB}")
        store = json.load(open(JSON_DB))
        names = store[NAMES]
        graph = store[GRAPH]
    else:
        names = open_db(NAMES)
        graph = open_db(GRAPH)

    return names, graph

def genbank_view(params):
    for param in params:

        altname = resolve_fname(param.acc, format="gb")

        if os.path.isfile(param.acc):
            stream = utils.gz_read(param.acc)
        elif os.path.isfile(altname):
            stream = utils.gz_read(altname)
        else:
            stream = []
            utils.error(f"data not found: {param.acc}")

        for line in stream:
            print(line, end='')

def fetch_genbank(acc, dest_name):
    """
    Downloads a GenBank file for an accession and saves it to dest_name.
    """
    try:
        db = 'nuccore'
        rettype, retmode = "gbwithparts", "text"
        params = dict(db=db, rettype=rettype, id=acc, retmode=retmode)
        utils.download(EFETCH_URL, params=params, dest_name=dest_name)
    except Exception as exc:
        utils.error(exc)

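# Illustrative sketch (not part of the original module): utils.download() is
# defined elsewhere; one plausible way to perform the same efetch request with
# the requests library. The URL and parameter names are the standard NCBI
# E-utilities ones; error handling is deliberately minimal.
def _demo_efetch_download(acc, dest_name):
    import requests

    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = dict(db="nuccore", rettype="gbwithparts", retmode="text", id=acc)

    resp = requests.get(url, params=params, stream=True)
    resp.raise_for_status()

    # Stream the response body into the destination file.
    with open(dest_name, "wb") as fp:
        for chunk in resp.iter_content(chunk_size=8192):
            fp.write(chunk)
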
def get_data(preload=False, acc=False):
    """
    Returns the names and graph structures for the database.
    """
    if preload:
        if not os.path.isfile(JSON_DB):
            utils.error(f"taxonomy file not found (you must build it first): {JSON_DB}")
        store = json.load(open(JSON_DB))
        names = store[TAXID]
        graph = store[GRAPH]
    else:
        names = open_db(TAXID)
        graph = open_db(GRAPH)

    return names, graph

def get_data(preload=False):
    if preload:
        if not os.path.isfile(JSON_DB):
            utils.error(f"ontology file not found (you must build it first): {JSON_DB}")
        store = json.load(open(JSON_DB))
        terms = store[TERM]
        nodes = store[GRAPH]
        names = store[NAMES]
        back = store[CHILDREN]
    else:
        terms = utils.open_db(TERM, fname=SQLITE_DB)
        nodes = utils.open_db(GRAPH, fname=SQLITE_DB)
        names = utils.open_db(NAMES, fname=SQLITE_DB)
        back = utils.open_db(CHILDREN, fname=SQLITE_DB)

    return terms, nodes, names, back

def build_database(fname=TAXDB_NAME, limit=None):
    """
    Builds the taxonomy database from a downloaded taxdump file.
    """
    print(f"*** building database from: {fname}")
    path = os.path.join(utils.DATADIR, fname)

    # Check that the taxdump file exists.
    if not os.path.isfile(path):
        utils.error("no taxdump file found, run the --download flag first")

    # Parse the names.
    name_dict = parse_names(fname, limit=limit)

    # Parse the nodes.
    node_dict, back_dict = parse_nodes(fname, name_dict=name_dict, limit=limit)

    def save_table(name, obj):
        size = len(obj)
        table = open_db(table=name, flag='w')
        for index, (key, value) in enumerate(obj.items()):
            table[key] = value
            if index % CHUNK == 0:
                perc = round(index / size * 100)
                print(f"*** saving {name} with {size:,} elements ({perc:.0f}%)", end="\r")
                table.commit()
        print(f"*** saved {name} with {size:,} elements (100%)", end="\r")
        print("")
        table.commit()
        table.close()

    # Save the names into the database.
    save_table(NAMES, name_dict)

    # Save the nodes.
    save_table(GRAPH, node_dict)

    print("*** saving the JSON model")
    json_path = os.path.join(utils.DATADIR, JSON_DB)

    # The JSON model only contains the graph and the names.
    store = dict(NAMES=name_dict, GRAPH=node_dict, SYNONYMS={}, BACK={})
    fp = open(json_path, 'wt')
    json.dump(store, fp, indent=4)
    fp.close()

def filter_file(stream, terms, keep, remove, graph, colidx=0):
    """
    Filters a file to retain only the rows where a taxid is in a subtree.
    """
    if not stream:
        if len(terms) == 0:
            utils.error("filtering needs an input stream or a filename")
        stream = open(terms[0])

    # Collects all children of the taxids.
    keep_dict, remove_dict = {}, {}

    # Taxids to keep.
    keeper = keep.split(",")

    # Fill the keeper dictionary.
    for term in keeper:
        dfs_visitor(graph=graph, node=term, visited=keep_dict)

    # Fill the remover dictionary.
    remover = remove.split(",")
    for term in remover:
        dfs_visitor(graph=graph, node=term, visited=remove_dict)

    # Read the stream.
    reader = csv.reader(stream, delimiter="\t")

    # Selection conditions.
    def keep_func(row):
        taxid = row[colidx]
        return taxid in keep_dict

    def remove_func(row):
        taxid = row[colidx]
        return taxid not in remove_dict

    # What to keep.
    if keep:
        reader = filter(keep_func, reader)

    # What to remove.
    if remove:
        reader = filter(remove_func, reader)

    # Generate the output.
    writer = csv.writer(sys.stdout, delimiter="\t")
    writer.writerows(reader)

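# Illustrative sketch (not part of the original module): dfs_visitor() is
# defined elsewhere; filter_file() only relies on it filling the `visited`
# dictionary with every node reachable from a starting taxid. A minimal
# version of such a visitor over a dict-of-lists graph might look like this.
def _demo_dfs_visitor(graph, node, visited):
    # Mark the current node, then recurse into unvisited children.
    visited[node] = True
    for child in graph.get(node, []):
        if child not in visited:
            _demo_dfs_visitor(graph, child, visited)

# Example: _demo_dfs_visitor({"1": ["2", "3"], "2": ["4"]}, "1", visited={})
# leaves visited == {"1": True, "2": True, "4": True, "3": True}.
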
def build_database(fname=TAXDB_NAME, limit=None):
    """
    Builds the taxonomy database from a downloaded taxdump file.
    """
    print(f"*** building database from: {fname}")
    path = os.path.join(utils.DATADIR, fname)

    # Check that the taxdump file exists.
    if not os.path.isfile(path):
        utils.error("no taxdump file found, run the --download flag first")

    # Get the assembly accessions.
    _, _, taxon_acc = ncbi.parse_summary()

    # Parse the names.
    name_dict, latin_dict = parse_names(fname, limit=limit, taxon_acc=taxon_acc)

    # Parse the nodes.
    node_dict, back_dict = parse_nodes(fname, name_dict=name_dict, limit=limit)

    # A shortcut to the table saving function.
    def save_table(name, obj):
        utils.save_table(name=name, obj=obj, fname=SQLITE_DB)

    # Save the names into the database.
    save_table(NAMES, name_dict)

    # Save the nodes.
    save_table(GRAPH, node_dict)

    # Save the Latin names.
    save_table(LATIN, latin_dict)

    print("*** saving the JSON model")
    json_path = os.path.join(utils.DATADIR, JSON_DB)

    # The JSON model only contains the graph and the names.
    store = dict(NAMES=name_dict, GRAPH=node_dict, SYNONYMS={}, BACK={}, LATIN=latin_dict)
    fp = open(json_path, 'wt')
    json.dump(store, fp, indent=4)
    fp.close()

def get_json(name, seqid=None, update=False, inter=False, strict=False):
    """
    Attempts to return JSON formatted data based on a name.
    """
    # The name is an existing path to a file.
    if os.path.isfile(name):
        data = jsonrec.parse_file(name, seqid=seqid)
        return data

    # Not a local file, attempt to resolve it in storage.
    # Report as not found when an update is requested.
    if update:
        return None

    # The JSON representation of the data.
    json_name = resolve_fname(name=name, format="json")

    # The GenBank representation of the data.
    gbk_name = resolve_fname(name=name, format="gb")

    # Found the JSON representation of the file.
    if os.path.isfile(json_name):
        logger.info(f"found {json_name}")
        data = read_json_file(json_name)
        return data

    # No JSON file but there is a GenBank file.
    if os.path.isfile(gbk_name):
        logger.info(f"found {gbk_name}")
        data = jsonrec.parse_file(fname=gbk_name, seqid=seqid)
        data = save_json_file(fname=json_name, data=data)
        return data

    # Not found; in interactive mode create a JSON record from the name itself.
    if inter:
        data = jsonrec.make_jsonrec(seq=name, seqid=seqid)
        return data

    # At this point the data was not found.
    if strict:
        utils.error(f"data not found: {name}")

    return None

def json_view(params):
    """
    Prints the JSON representation of the data.
    """
    for param in params:

        # Stop if the data was not found.
        if not param.json:
            utils.error(f"data not found: {param.acc}")

        # Override the sequence ids for every record.
        if param.seqid:
            for rec in param.json:
                rec[const.SEQID] = param.seqid

        # Produce a nicely indented JSON representation.
        text = json.dumps(param.json, indent=4)
        print(text)

def modify_record(seq, param):
    """
    Modifies a sequence record based on the parameters.
    """
    # Shortcuts to the coordinates.
    start, end = param.start, param.end

    # Words added to the description to keep track of the operations.
    desc = []

    # Slice the sequence.
    if start != 0 or end:
        # Don't exceed the sequence length.
        end = len(seq) if not end else min(end, len(seq))
        seq = seq[start:end]
        desc.append(f'[{start + 1}:{end}]')

    try:
        # Possible sequence transformations.
        if param.revcomp:
            seq = seq.reverse_complement()
            desc.append("reverse-complemented")

        if param.reverse:
            seq = seq[::-1]
            desc.append("reversed")

        if param.complement:
            seq = seq.complement()
            desc.append("complemented")

        if param.translate:
            seq = seq.translate()
            desc.append("translated")

        if param.transcribe:
            seq = seq.transcribe()
            desc.append("transcribed DNA")

    except Exception as exc:
        utils.error(exc)

    return seq, desc

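# Illustrative sketch (not part of the original module): modify_record() assumes
# the sequence object supports the Biopython Seq API. A self-contained example
# of the transformations it applies, assuming Biopython is installed.
def _demo_seq_transforms():
    from Bio.Seq import Seq

    seq = Seq("ATGGCC")
    assert str(seq.reverse_complement()) == "GGCCAT"
    assert str(seq.complement()) == "TACCGG"
    assert str(seq.transcribe()) == "AUGGCC"
    assert str(seq.translate()) == "MA"
    return seq
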
def fasta_view(params):
    """
    Converts data to FASTA.
    """
    for param in params:

        # Stop when the data was not found.
        if not param.json:
            utils.error(f"data not found: {param.acc}")

        # Each data may have multiple entries.
        for item in param.json:

            # Get the FASTA records for each entry.
            recs = get_fasta(item, param=param)

            # Print the FASTA records.
            print_fasta(recs)

def get_json(name, seqid=None, inter=False, strict=False):
    """
    Attempts to return JSON formatted data based on a name.
    """
    # The name is an existing path to a file.
    if os.path.isfile(name):
        try:
            data = jsonrec.parse_file(name, seqid=seqid)
        except Exception as exc:
            logger.error(f"JSON parsing error for file {name}: {exc}")
            sys.exit(-1)
        return data

    # The JSON representation of the data.
    json_name = resolve_fname(name=name, format="json")

    # The GenBank representation of the data.
    gbk_name = resolve_fname(name=name, format="gb")

    # Found the JSON representation of the file.
    if os.path.isfile(json_name):
        logger.info(f"found {json_name}")
        data = read_json_file(json_name)
        return data

    # There is no JSON file but there is a GenBank file.
    if os.path.isfile(gbk_name):
        logger.info(f"found {gbk_name}")
        data = jsonrec.parse_file(fname=gbk_name, seqid=seqid)
        data = save_json_file(fname=json_name, data=data)
        return data

    # Interactive input, make a JSON record from the name itself.
    if inter:
        data = jsonrec.make_jsonrec(name, seqid=seqid)
        return data

    # Raise an error when in strict mode.
    if strict:
        utils.error(f"data not found: {name}")

    return None

def get_data(preload=False, acc=False):
    if preload:
        if not os.path.isfile(JSON_DB):
            utils.error(f"taxonomy file not found (you must build it first): {JSON_DB}")
        store = json.load(open(JSON_DB))
        names = store[NAMES]
        graph = store[GRAPH]
        latin = store[LATIN]
    else:
        names = open_db(NAMES)
        graph = open_db(GRAPH)
        latin = open_db(LATIN)

    if acc:
        _, taxon_acc, _ = ncbi.get_data()
    else:
        taxon_acc = {}

    return names, graph, taxon_acc, latin

def make_param(name):
    """
    Creates a parameter for each accession.
    """
    # Set the verbosity.
    utils.set_verbosity(logger, level=int(verbose))

    # A very common error is to pass a command line flag as an accession.
    if name.startswith("-"):
        msg = f"Invalid accession number: {name}"
        utils.error(msg)

    # A simple wrapper class to carry all parameters around.
    p = objects.Param(start=start, end=end, seqid=seqid, protein=protein,
                      revcomp=revcomp, update=update, name=name, gff=gff,
                      translate=translate, reverse=reverse, complement=complement,
                      fasta=fasta, type=type, gene=gene, regexp=match,
                      transcribe=transcribe)

    # Fill the JSON data for the parameter.
    p.json = storage.get_json(p.name, seqid=seqid, inter=inter)

    return p

def search_names(word, fname=TAXDB_NAME, name="names.dmp", limit=None):
    """
    Parses the names.dmp component of the taxdump.
    """
    if not os.path.isfile(fname):
        utils.error("taxdump file not found (download and build it first)")

    # Open a stream into the taxdump file.
    tar = tarfile.open(fname, "r:gz")
    stream = get_stream(tar=tar, name=name, limit=limit)
    stream = csv.reader(stream, delimiter="\t")

    # The pattern may be a regular expression.
    patt = re.compile(word, re.IGNORECASE)

    # Labels that will be searched.
    valid = ('scientific name', 'equivalent name', 'genbank common name')

    for index, elems in enumerate(stream):
        taxid, name, label = elems[0], elems[2], elems[6]
        if label in valid and patt.search(name):
            yield taxid, name

def build_database(archive=TAXDB_NAME, limit=None):
    """
    Builds the taxonomy database from the taxdump archive.
    """
    print(f"*** building database from: {archive}")

    # The location of the archive.
    path = os.path.join(utils.DATADIR, archive)

    # Download the latest taxdump file.
    update_taxdump()

    # Check that the archive exists.
    if not os.path.isfile(path):
        utils.error("no taxdump file found")

    # Parse the names.
    tax2data = parse_names(archive, limit=limit)

    # Parse the nodes and the back-propagation.
    graph = parse_nodes(archive, tax2data=tax2data, limit=limit)

    # A shortcut to the table saving function.
    def save_table(name, obj):
        utils.save_table(name=name, obj=obj, fname=SQLITE_DB)

    # Save the taxid definitions.
    save_table(TAXID, tax2data)

    # Save the graph.
    save_table(GRAPH, graph)

    print("*** saving the JSON model")
    json_path = os.path.join(utils.DATADIR, JSON_DB)

    # Save the JSON file as well.
    store = dict(TAXID=tax2data, GRAPH=graph)
    fp = open(json_path, 'wt')
    json.dump(store, fp, indent=4)
    fp.close()

def run(start=1, end='', gap_open=11, gap_extend=1, local_=False, global_=False,
        semiglobal=False, protein=False, translate=False, inter=False, table=False,
        mutations=False, strict=False, pep1=False, pep3=False, limit=1,
        verbose=False, target=None, query=None):
    """
    Performs an alignment between the query and the target.
    """
    # Alignments over this size will take a long time!
    MAX_LEN = 100000

    # Set the verbosity of the process.
    utils.set_verbosity(logger, level=int(verbose))

    # Reset the counter (needed for consistency during testing).
    jsonrec.reset_sequence_names()

    # This method requires two inputs.
    if not (query and target):
        utils.error("Please specify a TARGET and a QUERY")

    # Select the alignment mode.
    if global_:
        mode = const.GLOBAL_ALIGN
    elif local_:
        mode = const.LOCAL_ALIGN
    elif semiglobal:
        mode = const.SEMIGLOBAL_ALIGN
    else:
        mode = const.GLOBAL_ALIGN

    # The parameters shared by each record.
    common = dict(protein=protein, translate=translate, mutations=mutations,
                  pep1=pep1, pep3=pep3, table=table, strict=strict,
                  start=start, end=end, gap_open=gap_open,
                  gap_extend=gap_extend, mode=mode)

    # Create the parameters that represent each data.
    param_t = objects.Param(acc=target, **common)
    param_q = objects.Param(acc=query, **common)

    # Fill the JSON data for the parameters.
    param_t.json = fetch.get_json(param_t.acc, inter=inter, strict=True)[:limit]
    param_q.json = fetch.get_json(param_q.acc, inter=inter, strict=True)[:limit]

    # Each data object may contain several records.
    # For more than one record we iterate in pairs.
    for rec1, rec2 in zip(param_q.json, param_t.json):

        qrecs = fastarec.get_fasta(rec1, param=param_q)
        trecs = fastarec.get_fasta(rec2, param=param_t)

        for qseq, tseq in zip(qrecs, trecs):

            if len(qseq) > MAX_LEN:
                utils.error(f"query sequence is longer than the maximum: {len(qseq):,} > {MAX_LEN:,}")

            if len(tseq) > MAX_LEN:
                utils.error(f"target sequence is longer than the maximum: {len(tseq):,} > {MAX_LEN:,}")

            biopython_align(qseq=qseq, tseq=tseq, param=param_q)

def ncbi_efetch(name, gbk_name, db=None):
    """
    Connects to NCBI Entrez to download data.
    """
    # Get the entire GenBank file.
    format, retmode = "gbwithparts", "text"

    # Guess which accession numbers are proteins.
    if name[:2] in ["AP", "NP", "YP", "XP", "WP", "AK"]:
        db = db or "protein"
    else:
        db = db or "nuccore"

    try:
        logger.info(f"connecting to Entrez for {name}")
        stream = Entrez.efetch(id=name, db=db, rettype=format, retmode=retmode)
    except Exception as exc:
        msg = f"{exc} for efetch acc={name} db={db} format={format} mode={retmode}"
        utils.error(msg)

    # Save the stream as a GenBank file.
    utils.save_stream(stream=stream, fname=gbk_name)

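# Illustrative sketch (not part of the original module): a minimal standalone
# call to Biopython's Entrez.efetch, mirroring the parameters used above.
# NCBI asks for a contact email; the address below is a placeholder.
def _demo_entrez_efetch(acc="NC_045512", dest="out.gb"):
    from Bio import Entrez

    Entrez.email = "you@example.com"  # placeholder, set your own address
    stream = Entrez.efetch(id=acc, db="nuccore", rettype="gbwithparts", retmode="text")
    with open(dest, "w") as fp:
        fp.write(stream.read())
    stream.close()
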
def get_metadata(taxid, limit=None):
    """
    Returns the accession metadata rows for a viral taxid.
    """
    import requests

    # The NCBI Datasets access point.
    url = f"https://api.ncbi.nlm.nih.gov/datasets/v1alpha/virus/taxon/{taxid}/genome/table"

    params = {
        'format': 'tsv',
        'refseq_only': "false",
        'complete_only': 'true',
        'table_fields': [
            'host_tax_id',
            'species_tax_id',
            'nucleotide_accession',
            'collection_date',
            'geo_location',
            'isolate_name',
        ]
    }

    conn = requests.get(url, stream=True, params=params)

    # Stop on an unsuccessful request.
    if conn.status_code != 200:
        msg = f"HTTP status code: {conn.status_code}"
        utils.error(msg)

    # Stream, cap, and decode the lines.
    lines = conn.iter_lines()
    lines = islice(lines, limit)
    lines = map(decode, lines)

    return lines

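# Illustrative sketch (not part of the original module): get_metadata() relies
# on a module-level decode() helper and on itertools.islice to cap the number
# of streamed lines. A self-contained version of that streaming pattern; the
# URL below is a placeholder, not the real datasets endpoint.
def _demo_stream_lines(url="https://example.org/table.tsv", limit=10):
    from itertools import islice
    import requests

    resp = requests.get(url, stream=True)
    if resp.status_code != 200:
        raise RuntimeError(f"HTTP status code: {resp.status_code}")

    # iter_lines() yields bytes; decode each line and stop after `limit` lines.
    lines = islice(resp.iter_lines(), limit)
    return [line.decode("utf-8") for line in lines]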