def check_prodigal_results(prodigal_results, output_directory):
    """Collect the inputs for which Prodigal did not predict any genes.

    Parameters
    ----------
    prodigal_results : list
        Gene prediction results from Prodigal, one entry per input.
        Only the first two fields of each entry are used: the first
        one labels the case in the report and the second one is
        either a value equal to 0 (no coding sequences) or a string
        (reported as the stderr of the failed run).
    output_directory : str
        Path to the directory where the report about the failed
        cases is written.

    Returns
    -------
    list
        A list with two elements:

        failed : list
            Entries whose second field is equal to 0, followed by
            the entries whose second field is a string.
        failed_file : str
            Path to the report, `prodigal_stderr.tsv` inside
            `output_directory`. The file is only written when
            `failed` is not empty.
    """
    without_cds = []
    with_errors = []
    for result in prodigal_results:
        outcome = result[1]
        if outcome == 0:
            without_cds.append(result)
        if isinstance(outcome, str):
            with_errors.append(result)

    # cases without coding sequences are listed before the errors
    failed = without_cds + with_errors
    failed_file = os.path.join(output_directory, 'prodigal_stderr.tsv')
    if failed:
        report = ['{0}\t{1}'.format(entry[0], entry[1]) for entry in failed]
        fo.write_lines(report, failed_file)

    return [failed, failed_file]
def unzip_file(compressed_file, archive_type='.gz'):
    """Uncompress a gzip-compressed text file.

    Parameters
    ----------
    compressed_file : str
        Path to the compressed file.
    archive_type : str
        File extension of the archive. It is removed from the end of
        `compressed_file` to build the path of the uncompressed file.
        The contents are always read with `gzip`, regardless of this
        value.

    Returns
    -------
    uncompressed_file : str
        Path to the uncompressed file. Equal to `compressed_file`
        when the path does not end with `archive_type`.
    """
    with gzip.open(compressed_file, 'rb') as infile:
        lines = [line.decode() for line in infile]

    # Remove the extension as a whole suffix. str.rstrip('.gz') strips
    # every trailing '.', 'g' and 'z' character instead, which corrupts
    # names such as 'zigzag.gz' (it would become 'zigza').
    if archive_type and compressed_file.endswith(archive_type):
        uncompressed_file = compressed_file[:-len(archive_type)]
    else:
        # NOTE(review): without a matching extension the output path is
        # the input path, so the compressed file is presumably replaced
        # by its uncompressed contents - confirm this is acceptable.
        uncompressed_file = compressed_file

    # lines keep their line terminators, so they are joined with ''
    fo.write_lines(lines, uncompressed_file, joiner='')

    return uncompressed_file
def save_extracted_cds(genome, identifier, orf_file, protein_table, cds_file):
    """Extract and save the coding sequences predicted for a genome.

    The coding sequences are written to a FASTA file and the
    information about them is written to a TSV file.

    Parameters
    ----------
    genome : str
        Path to the FASTA file with the contigs of the genome.
    identifier : str
        Genome identifier passed to the helpers that create the
        FASTA records and the TSV file.
    orf_file : str
        Path to the pickled file with the Prodigal results.
    protein_table : str
        Path to the TSV file that will store the information about
        the coding sequences.
    cds_file : str
        Path to the FASTA file that will store the coding sequences.

    Returns
    -------
    total_cds : int
        Number of coding sequences extracted from the genome.
    """
    contigs = fao.import_sequences(genome)
    predictions = fo.pickle_loader(orf_file)

    # first element: coding sequences; second element: data that is
    # forwarded to the protein table
    extracted = extract_genome_cds(predictions, contigs, 1)
    coding_sequences = extracted[0]

    fasta_records = fao.create_fasta_lines(coding_sequences, identifier)
    fo.write_lines(fasta_records, cds_file)

    write_protein_table(protein_table, identifier, extracted[1])

    return len(coding_sequences)
def integer_headers(input_fasta, output_fasta, start=1, limit=5000):
    """Replace the headers of FASTA records with sequential identifiers.

    Each record gets a header with the format `seq_<n>`, where `<n>`
    starts at `start` and is incremented by one for every record.

    Parameters
    ----------
    input_fasta : str
        Path to the input FASTA file.
    output_fasta : str
        Path to the output file with the modified headers.
    start : int
        Value used for the identifier of the first record.
    limit : int
        Number of records accumulated in memory before they are
        passed to `fo.write_lines`.

    Returns
    -------
    ids_map : dict
        Mapping between the new identifiers (keys) and the original
        record identifiers (values).
    """
    ids_map = {}
    batch = []
    next_index = start
    for record in SeqIO.parse(input_fasta, 'fasta'):
        new_id = 'seq_{0}'.format(next_index)
        ids_map[new_id] = record.id
        batch.append('>{0}\n{1}'.format(new_id, str(record.seq)))
        next_index += 1

        if len(batch) == limit:
            fo.write_lines(batch, output_fasta)
            batch = []

    # Flush what is left once the input is exhausted. This call is made
    # even when the batch is empty.
    # NOTE(review): every flush uses the same call, so whether a later
    # batch is appended to or replaces an earlier one depends on the
    # default write mode of fo.write_lines - confirm.
    fo.write_lines(batch, output_fasta)

    return ids_map
def predict(model, subset_for_prediction, targets, filename):
    """Label a dataset with `model` and write word/label pairs to a file.

    One tab-separated `word<TAB>label` line is produced per word,
    followed by an empty line after each sentence. Only the part of
    the label name after its last "-" is written.

    Parameters
    ----------
    model
        Model handed to `Predictor`.
    subset_for_prediction
        Dataset to label. It must provide a `raw_words` field through
        `get_field` (presumably a fastNLP DataSet - confirm).
    targets
        Object whose `to_word` method converts a predicted label into
        its name (presumably a label vocabulary - confirm).
    filename
        Path handed to `write_lines` together with the output lines.
    """
    predicted = Predictor(model).predict(subset_for_prediction)['pred']
    raw_words = list(subset_for_prediction.get_field('raw_words'))

    output = []
    for label_batch, tokens in zip(predicted, raw_words):
        # entries whose first element is a bare int hold no label
        # sequence and are skipped
        if type(label_batch[0]) is int:
            continue

        print(label_batch)
        # lazy on purpose: labels are only converted for as many
        # positions as there are words
        tags = (
            f'{targets.to_word(label).split("-")[-1]}'
            for label in label_batch[0]
        )
        output.extend('\t'.join(pair) for pair in zip(tokens, tags))
        # empty line marks the end of the sentence
        output.append('')

    write_lines(filename, output)
def _predict(self, subset_for_prediction, targets, filename):
    """Label a dataset with this model and return word/label lines.

    One space-separated `word label` line is produced per word,
    followed by an empty line after each sentence. Label names are
    written exactly as returned by `targets.to_word`.

    Parameters
    ----------
    subset_for_prediction
        Dataset to label. It must provide a `raw_words` field through
        `get_field` (presumably a fastNLP DataSet - confirm).
    targets
        Object whose `to_word` method converts a predicted label into
        its name (presumably a label vocabulary - confirm).
    filename
        Path handed to `write_lines` together with the output lines.
        Nothing is written when it is None.

    Returns
    -------
    list
        The output lines, whether or not they were written to a file.
    """
    predicted = Predictor(self).predict(subset_for_prediction)['pred']
    raw_words = list(subset_for_prediction.get_field('raw_words'))

    output = []
    for label_batch, tokens in zip(predicted, raw_words):
        # entries whose first element is a bare int hold no label
        # sequence and are skipped
        if type(label_batch[0]) is int:
            continue

        # lazy on purpose: labels are only converted for as many
        # positions as there are words
        tags = (f'{targets.to_word(label)}' for label in label_batch[0])
        output.extend(' '.join(pair) for pair in zip(tokens, tags))
        # empty line marks the end of the sentence
        output.append('')

    if filename is not None:
        write_lines(filename, output)

    return output
def translate_fastas(fasta_paths, output_directory, translation_table):
    """Translate the DNA sequences stored in a set of FASTA files.

    One protein FASTA file is created per input file. Its name is the
    basename of the input file with '.fasta' replaced by
    '_protein.fasta'.

    Parameters
    ----------
    fasta_paths : list
        Paths to the FASTA files with the DNA sequences to translate.
    output_directory : str
        Path to the directory where the FASTA files with the protein
        sequences are written.
    translation_table : int
        Genetic code used to translate the DNA sequences.

    Returns
    -------
    protein_files : list
        Paths to the FASTA files with the translated sequences, in
        the same order as `fasta_paths`.
    """
    protein_files = []
    for fasta_path in fasta_paths:
        dna_records = import_sequences(fasta_path)

        proteins = {}
        for seqid, dna in dna_records.items():
            translation = sm.translate_dna(dna, translation_table, 0)
            proteins[seqid] = str(translation[0][0])

        protein_lines = fasta_lines(list(proteins), proteins)

        protein_name = fo.file_basename(fasta_path).replace(
            '.fasta', '_protein.fasta')
        protein_path = fo.join_paths(output_directory, [protein_name])
        fo.write_lines(protein_lines, protein_path)
        protein_files.append(protein_path)

    return protein_files
def structure_stanford_output(input_file, output_file):
    """Convert a parser output file into a file with phrase types.

    The input is split into blocks at every line that starts with
    'Sentence' (presumably the text output of Stanford CoreNLP -
    confirm). Each block is decoded into a list of dependencies, the
    phrase types of each sentence are computed with
    `get_phrase_types`, and the result is written with `write_lines`.

    The output starts with the line '-DOCSTART- -X- -X- O' followed
    by an empty line, and an empty line is added after the phrase
    types of each sentence.

    Parameters
    ----------
    input_file
        Path handed to `read_lines`.
    output_file
        Path handed to `write_lines`.
    """
    sentences = []
    block = []
    for line in read_lines(input_file):
        if not line.startswith('Sentence'):
            block.append(line)
            continue

        # A header line closes the block of the previous sentence. The
        # first three lines and the last line of the block are dropped.
        block = block[3:-1]
        if block:
            sentences.append(_decode_sentence_block(block))
        block = []

    # The last block is not followed by a header line, so only its
    # first three lines are dropped. An empty block (for example an
    # empty input file) is skipped instead of failing.
    block = block[3:]
    if block:
        sentences.append(_decode_sentence_block(block))

    dep_tree = []
    phrase_types = ['-DOCSTART- -X- -X- O', '']
    for sentence in sentences:
        # NOTE(review): dep_tree is filled but never written by this
        # function. The call is kept because make_dep_tree may also
        # modify `sentence` before it is used below - confirm.
        make_dep_tree(('ROOT', 0, 'ROOT'), sentence, 0, dep_tree)
        dep_tree.append('')

        phrase_types.extend(get_phrase_types(sentence))
        # empty line to split sentences
        phrase_types.append('')

    write_lines(output_file, phrase_types)


def _decode_sentence_block(block):
    """Decode the dependency lines of one sentence block.

    The lines before the first empty line are token lines. The
    dependency lines start two lines after that empty line. Raises
    ValueError when the block has no empty line.
    """
    separator = block.index('')

    # Part-of-speech tag of each token, DEFAULT_POS when a token line
    # has no 'PartOfSpeech=' field. Computed once per block instead of
    # once per dependency line.
    pos_tags = []
    for token in block[:separator]:
        fields = token.split('PartOfSpeech=')
        tag = fields[1] if len(fields) >= 2 else DEFAULT_POS
        pos_tags.append(tag.replace(']', ''))

    # every call receives its own list, as in the original expression
    return [decode_dependency(dependency, ['ROOT'] + pos_tags)
            for dependency in block[separator + 2:]]