Example #1
import os

# 'fo' refers to the project's file-operations helper module
# (assumed to provide write_lines)
def check_prodigal_results(prodigal_results, output_directory):
    """ Determines if Prodigal could not predict genes for any input
        assembly.

        Parameters
        ----------
        prodigal_results : list
            List with gene prediction results from Prodigal.
        output_directory : str
            Path to the output directory where the file with information
            about failed cases will be written to.

        Returns
        -------
        A list with the following elements:
            failed : list
                List with the stderr for the cases that Prodigal
                failed to predict genes for.
            failed_file : str
                Path to the file with information about the failed
                cases.
    """

    no_cds = [result for result in prodigal_results if result[1] == 0]
    errors = [result for result in prodigal_results if isinstance(result[1], str)]
    failed = no_cds + errors

    failed_file = os.path.join(output_directory, 'prodigal_stderr.tsv')
    if len(failed) > 0:
        lines = ['{0}\t{1}'.format(result[0], result[1]) for result in failed]
        fo.write_lines(lines, failed_file)

    return [failed, failed_file]
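A short usage sketch may help clarify the expected shape of prodigal_results: each element pairs an input path with either a CDS count or a stderr string. All tuples and paths below are hypothetical, and fo.write_lines is assumed to be available:

results = [('genome1.fasta', 1432),
           ('genome2.fasta', 0),
           ('genome3.fasta', 'Error: sequence shorter than minimum length')]
failed, failed_file = check_prodigal_results(results, '/tmp/analysis')
# failed -> [('genome2.fasta', 0), ('genome3.fasta', 'Error: ...')]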
Example #2
import gzip

def unzip_file(compressed_file, archive_type='.gz'):
    """ Uncompresses a file.

        Parameters
        ----------
        compressed_file : str
            Path to the compressed file.
        archive_type : str
            Archive format extension, used to derive the name
            of the uncompressed file.

        Returns
        -------
        uncompressed_file : str
            Path to the uncompressed file.
    """

    with gzip.open(compressed_file, 'rb') as f:
        lines = [line.decode() for line in f]

    # save uncompressed contents; slicing off the suffix avoids the
    # str.rstrip pitfall (rstrip removes a set of characters, not a suffix,
    # so e.g. 'sample_g.gz'.rstrip('.gz') would also strip the trailing 'g')
    uncompressed_file = compressed_file[:-len(archive_type)]
    fo.write_lines(lines, uncompressed_file, joiner='')

    return uncompressed_file
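For reference, the same decompression can be done without the fo helper and without holding the whole file in memory. A minimal standalone sketch using only the standard library:

import gzip
import shutil

def gunzip(path, suffix='.gz'):
    # slice off the suffix instead of using rstrip (see the note above)
    out_path = path[:-len(suffix)] if path.endswith(suffix) else path + '.out'
    with gzip.open(path, 'rb') as src, open(out_path, 'wb') as dst:
        shutil.copyfileobj(src, dst)
    return out_path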
Example #3
def save_extracted_cds(genome, identifier, orf_file, protein_table, cds_file):
    """ Extracts coding sequences from a genome assembly based
        on Prodigal's gene predictions. Writes coding sequences
        to a FASTA file and information about coding sequences to
        a TSV file.

        Parameters
        ----------
        genome : str
            Path to the FASTA file with the sequences of a
            genome assembly.
        identifier : str
            Genome identifier to add to FASTA records headers
            and to the first field in the TSV file.
        orf_file : str
            Path to the file with Prodigal results.
        protein_table : str
            Path to the TSV file to which coding sequences
            information will be written.
        cds_file : str
            Path to the FASTA file to which coding sequences
            will be written.

        Returns
        -------
        total_cds : int
            Total number of coding sequences extracted from
            the genome.
    """

    # import contigs for current genome/assembly
    contigs = fao.import_sequences(genome)
    # extract coding sequences from contigs
    reading_frames = fo.pickle_loader(orf_file)
    genome_info = extract_genome_cds(reading_frames, contigs, 1)
    # create FASTA records and write them to the CDS file
    cds_lines = fao.create_fasta_lines(genome_info[0], identifier)
    fo.write_lines(cds_lines, cds_file)

    write_protein_table(protein_table, identifier, genome_info[1])

    total_cds = len(genome_info[0])

    return total_cds
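A hypothetical call, assuming the pickled Prodigal output and the output directories already exist (all paths below are illustrative):

total_cds = save_extracted_cds('genomes/GCA_000001.fasta', 'GCA_000001',
                               'temp/GCA_000001_ORF.pkl',
                               'results/protein_info.tsv',
                               'results/GCA_000001_cds.fasta')
print('{0} coding sequences extracted'.format(total_cds))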
Example #4
from Bio import SeqIO

def integer_headers(input_fasta, output_fasta, start=1, limit=5000):
    """ Switches FASTA records headers in a file by integer
        values.

        Parameters
        ----------
        input_fasta : str
            Path to the input FASTA file.
        output_fasta : str
            Path to the output file with modified headers.
        start : int
            Integer value of first identifier.
        limit : int
            Maximum number of FASTA records to keep in
            memory.

        Returns
        -------
        ids_map : dict
            Dictionary with mapping between integer and original
            headers.
    """

    seqs = []
    ids_map = {}
    exhausted = False
    seq_generator = SeqIO.parse(input_fasta, 'fasta')
    while exhausted is False:
        record = next(seq_generator, None)
        if record is not None:
            new_id = 'seq_{0}'.format(start)
            ids_map[new_id] = record.id
            sequence = str(record.seq)
            new_rec = '>{0}\n{1}'.format(new_id, sequence)
            seqs.append(new_rec)
            start += 1
        else:
            exhausted = True

        # flush the current batch of records; write_lines is assumed to
        # append on subsequent batches, otherwise earlier records would
        # be overwritten
        if len(seqs) == limit or exhausted is True:
            fo.write_lines(seqs, output_fasta)
            seqs = []

    return ids_map
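A minimal self-contained variant, using Biopython directly and streaming each record to the output file instead of batching through fo.write_lines:

from Bio import SeqIO

def number_headers(input_fasta, output_fasta, start=1):
    ids_map = {}
    with open(output_fasta, 'w') as out:
        for i, record in enumerate(SeqIO.parse(input_fasta, 'fasta'), start):
            new_id = 'seq_{0}'.format(i)
            ids_map[new_id] = record.id
            out.write('>{0}\n{1}\n'.format(new_id, str(record.seq)))
    return ids_map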
Example #5
def predict(model, subset_for_prediction, targets, filename):
    predictor = Predictor(model)
    predictions = predictor.predict(subset_for_prediction)['pred']
    words = list(subset_for_prediction.get_field('raw_words'))
    lines = []

    words_sequence_index = 1
    labels_sequence_index = 0
    for sentence in zip(predictions, words):
        # skip malformed predictions that hold a raw integer instead of
        # a sequence of label indices
        if isinstance(sentence[labels_sequence_index][0], int):
            continue
        tokens = sentence[words_sequence_index]
        # keep only the entity type, dropping the BIO prefix ('B-PER' -> 'PER')
        labels = (targets.to_word(label).split('-')[-1]
                  for label in sentence[labels_sequence_index][0])
        for pair in zip(tokens, labels):
            lines.append('\t'.join(pair))
        lines.append('')
    write_lines(filename, lines)
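The function emits CoNLL-style output: one token and its label per line, tab-separated, with a blank line between sentences. A self-contained sketch of that output format (the helper name and the sample data are hypothetical):

def to_conll(sentences, filename):
    # sentences: list of (tokens, labels) pairs
    lines = []
    for tokens, labels in sentences:
        lines.extend('\t'.join(pair) for pair in zip(tokens, labels))
        lines.append('')  # blank line separates sentences
    with open(filename, 'w') as f:
        f.write('\n'.join(lines))

to_conll([(['John', 'lives', 'in', 'Boston'], ['PER', 'O', 'O', 'LOC'])],
         'predictions.tsv')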
Example #6
    def _predict(self, subset_for_prediction, targets, filename):
        predictor = Predictor(self)
        predictions = predictor.predict(subset_for_prediction)['pred']
        words = list(subset_for_prediction.get_field('raw_words'))
        lines = []

        words_sequence_index = 1
        labels_sequence_index = 0
        for sentence in zip(predictions, words):
            # skip malformed predictions that hold a raw integer instead
            # of a sequence of label indices
            if isinstance(sentence[labels_sequence_index][0], int):
                continue
            tokens = sentence[words_sequence_index]
            # unlike the standalone variant, keep the full BIO tag ('B-PER')
            labels = (targets.to_word(label)
                      for label in sentence[labels_sequence_index][0])
            for pair in zip(tokens, labels):
                lines.append(' '.join(pair))
            lines.append('')
        if filename is not None:
            write_lines(filename, lines)
        return lines
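Here targets is presumably a fastNLP Vocabulary that maps label indices back to tag strings via to_word. A toy illustration under that assumption (the tag set is made up):

from fastNLP import Vocabulary

targets = Vocabulary(padding=None, unknown=None)
targets.add_word_lst(['O', 'B-PER', 'I-PER', 'B-LOC'])
print(targets.to_word(1))  # 'B-PER'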
Example #7
def translate_fastas(fasta_paths, output_directory, translation_table):
    """ Translates DNA sequences in a set of FASTA files.

        Parameters
        ----------
        fasta_paths : list
            List with the paths to the FASTA files that contain
            the DNA sequences to translate.
        output_directory : str
            Path to the output directory where FASTA files with
            protein sequences will be written to.
        translation_table : int
            Genetic code used to translate DNA sequences.

        Returns
        -------
        protein_files : list
            List that contains the paths to the FASTA files with
            translated sequences.
    """

    protein_files = []
    for path in fasta_paths:
        records = import_sequences(path)
        translated_records = {seqid: str(sm.translate_dna(seq, translation_table, 0)[0][0])
                              for seqid, seq in records.items()}
        translated_lines = fasta_lines(list(translated_records.keys()),
                                       translated_records)

        basename = fo.file_basename(path).replace('.fasta', '_protein.fasta')
        prot_file = fo.join_paths(output_directory, [basename])

        fo.write_lines(translated_lines, prot_file)
        protein_files.append(prot_file)

    return protein_files
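sm.translate_dna is project-specific; as a point of reference, the same translation step can be sketched with Biopython (table 11 is the bacterial/archaeal genetic code commonly used for Prodigal predictions):

from Bio.Seq import Seq

dna = 'ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG'
protein = str(Seq(dna).translate(table=11, to_stop=True))
print(protein)  # 'MAIVMGR'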
Example #8
def structure_stanford_output(input_file, output_file):

    def parse_sentence(sentence):
        # token lines precede the first empty line; dependency lines
        # start two lines after it
        token_lines = sentence[:sentence.index('')]
        tokens = ['ROOT'] + [
            (token.split('PartOfSpeech=')[1]
             if len(token.split('PartOfSpeech=')) >= 2
             else DEFAULT_POS).replace(']', '')
            for token in token_lines]
        dependency_lines = sentence[sentence.index('') + 2:]
        return [decode_dependency(line, tokens) for line in dependency_lines]

    sentences = []
    sentence = []
    for line in read_lines(input_file):
        if not line.startswith('Sentence'):
            sentence.append(line)
        else:
            # a new 'Sentence' header closes the previous block
            sentence = sentence[3:-1]
            if len(sentence) == 0:
                continue
            sentences.append(parse_sentence(sentence))
            sentence = []
    # the last sentence is not followed by another 'Sentence' header
    sentence = sentence[3:]
    sentences.append(parse_sentence(sentence))

    dep_tree = []
    phrase_types = ['-DOCSTART- -X- -X- O', '']
    for sentence in sentences:
        root = ('ROOT', 0, 'ROOT')
        make_dep_tree(root, sentence, 0, dep_tree)
        dep_tree.append('')
        for phrase_type in get_phrase_types(sentence):
            phrase_types.append(phrase_type)
        # add an empty line to split sentences
        phrase_types.append('')

    write_lines(output_file, phrase_types)
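The parser relies on CoreNLP's bracketed token format; a small illustration of the PartOfSpeech extraction it applies (the token line below is a hypothetical example of that format):

token_line = '[Text=Boston CharacterOffsetBegin=9 CharacterOffsetEnd=15 PartOfSpeech=NNP]'
pos = token_line.split('PartOfSpeech=')[1].replace(']', '')
print(pos)  # 'NNP'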