def standard_test(self, informat, outformat, params):
        """
        Align the reference input with the tool referenced by `muscle_exe` and
        compare the result, sequence by sequence, with the stored alignment.

        Arguments :
            informat  ( string )
                Input file format.
            outformat  ( string )
                Output file format.
            params  ( string )
                Arguments passed to the alignment tool.
        """
        source_path = '{}/f001.{}'.format(informat.capitalize(), informat)
        result_path = 'tmp_test.aln'
        self.add_file_to_clean(result_path)
        # The input file must exist and contain exactly 50 sequences
        self.assertTrue(os.path.isfile(source_path))
        input_records = list(SeqIO.parse(source_path, informat))
        self.assertEqual(len(input_records), 50)
        # Build the alignment into the temporary output file
        Align.get_alignment(muscle_exe,
                            source_path,
                            informat,
                            args=params,
                            outfile=result_path,
                            outfile_format=outformat)
        # The output file must have been created
        self.assertTrue(os.path.isfile(result_path))
        produced = SeqIO.to_dict(SeqIO.parse(result_path, outformat))
        # Load the previously stored alignment for the same parameters
        expected_path = '{}/f001.muscle_{}.aln'.format(
            outformat.capitalize(), params)
        self.assertTrue(os.path.isfile(expected_path))
        expected = SeqIO.to_dict(SeqIO.parse(expected_path, outformat))
        # Same number of sequences and identical aligned content
        self.assertEqual(len(viewkeys(produced)), len(viewkeys(expected)))
        for seq_id, aligned in viewitems(produced):
            produced_seq = str(aligned.seq)
            expected_seq = str(expected[seq_id].seq)
            self.assertEqual(produced_seq, expected_seq)
    def standard_test(self, informat, outformat, params):
        """
        Align the reference input with the tool referenced by `mafft_exe` and
        check the result against the previously stored alignment file.

        Arguments :
            informat  ( string )
                Input file format.
            outformat  ( string )
                Output file format.
            params  ( string )
                Arguments passed to the alignment tool.
        """
        in_path = '{}/f001.{}'.format(informat.capitalize(), informat)
        out_path = 'tmp_test.aln'
        self.add_file_to_clean(out_path)
        # Input sanity checks: the file exists and holds 50 sequences
        self.assertTrue(os.path.isfile(in_path))
        num_input_seqs = len(list(SeqIO.parse(in_path, informat)))
        self.assertEqual(num_input_seqs, 50)
        # Run the alignment tool
        alignment_options = dict(args=params,
                                 outfile=out_path,
                                 outfile_format=outformat)
        Align.get_alignment(mafft_exe, in_path, informat, **alignment_options)
        # Output checks: the file exists and matches the stored alignment
        self.assertTrue(os.path.isfile(out_path))
        new_align = SeqIO.to_dict(SeqIO.parse(out_path, outformat))
        reference_path = '{}/f001.mafft_{}.aln'.format(
            outformat.capitalize(), params)
        self.assertTrue(os.path.isfile(reference_path))
        reference_align = SeqIO.to_dict(SeqIO.parse(reference_path, outformat))
        self.assertEqual(len(viewkeys(new_align)),
                         len(viewkeys(reference_align)))
        for name, seq_record in viewitems(new_align):
            new_seq = str(seq_record.seq)
            reference_seq = str(reference_align[name].seq)
            self.assertEqual(new_seq, reference_seq)
Example #3
0
def get_keywords(tool):
    """
    Look up the keywords offered by an inference or bootstrapping tool.

    Arguments :
        tool  ( string )
            Name of the phylogenetic inference or bootstrapping tool (it is
            lower-cased before the lookup).

    Returns :
        dict
            Dictionary containing the keywords and their corresponding
            arguments, joined into a single space-separated string.

    Raises :
        ValueError
            If the tool introduced isn't included in MEvoLib.Inference.
    """
    tool = tool.lower()
    # Inference tools take precedence over bootstrapping tools
    if tool in _PHYLO_TOOL_TO_LIB:
        tool_lib = _PHYLO_TOOL_TO_LIB[tool]
    elif tool in _BOOTS_TOOL_TO_LIB:
        tool_lib = _BOOTS_TOOL_TO_LIB[tool]
    else:
        raise ValueError(
            'The tool "{}" isn\'t included in MEvoLib.Inference'.format(tool))
    return {keyword: ' '.join(arguments)
            for keyword, arguments in viewitems(tool_lib.KEYWORDS)}
Example #4
0
def get_tools():
    """
    Returns :
        dict
            Dictionary with the supertree tool names under 'supertree' and the
            consensus tree tool names under 'consensus', as included in the
            current version of MEvoLib.
    """
    tools = {}
    tools['supertree'] = list(viewkeys(_STREE_TOOL_TO_LIB))
    tools['consensus'] = list(viewkeys(_CONS_TOOL_TO_LIB))
    return tools
Example #5
0
def get_tools():
    """
    Returns :
        dict
            Dictionary with the phylogenetic inference tool names under
            'inference' and the bootstrapping tool names under 'bootstrap',
            as included in the current version of MEvoLib.
    """
    inference_tools = list(viewkeys(_PHYLO_TOOL_TO_LIB))
    bootstrap_tools = list(viewkeys(_BOOTS_TOOL_TO_LIB))
    return {'inference': inference_tools, 'bootstrap': bootstrap_tools}
Example #6
0
def get_tools():
    """
    Returns :
        dict
            Two-entry dictionary: 'supertree' maps to the list of supertree
            tools and 'consensus' to the list of consensus tree tools included
            in the current version of MEvoLib.
    """
    return dict(supertree=list(viewkeys(_STREE_TOOL_TO_LIB)),
                consensus=list(viewkeys(_CONS_TOOL_TO_LIB)))
 def test_simple_alignment(self):
     """
     Run the standard alignment test (FASTA input, FASTA output) once for
     every keyword reported by Align.get_keywords() for `mafft_exe`.
     """
     available_keywords = Align.get_keywords(mafft_exe)
     for keyword in viewkeys(available_keywords):
         self.standard_test('fasta', 'fasta', keyword)
Example #8
0
def get_keywords(tool):
    """
    Return the keywords available for a consensus tree tool.

    Arguments :
        tool  ( string )
            Name of the consensus tool (it is lower-cased before the lookup).
            Only the tools registered in _CONS_TOOL_TO_LIB are accepted; the
            supertree lookup is currently disabled, so supertree tool names
            are rejected.

    Returns :
        dict
            Dictionary containing the keywords and their corresponding
            arguments, joined into a single space-separated string.

    Raises :
        ValueError
            If the tool introduced isn't included in MEvoLib.PhyloAssemble.
    """
    tool = tool.lower()
    # NOTE(review): _STREE_TOOL_TO_LIB used to be consulted here as well but
    # that lookup was commented out; re-add it here if supertree tools must
    # be supported again.
    if tool not in _CONS_TOOL_TO_LIB:
        message = 'The tool "{}" isn\'t included in ' \
                  'MEvoLib.PhyloAssemble'.format(tool)
        raise ValueError(message)
    # The tool is known at this point, so the library lookup cannot fail and
    # no conditional binding is required
    tool_lib = _CONS_TOOL_TO_LIB[tool]
    return {key: ' '.join(value)
            for key, value in viewitems(tool_lib.KEYWORDS)}
Example #9
0
 def test_simple_phylo_inference(self):
     """
     Run the standard phylogenetic inference test (FASTA input, NEWICK
     output) once for every keyword reported by Inference.get_keywords()
     for `raxml_exe`.
     """
     available_keywords = Inference.get_keywords(raxml_exe)
     for keyword in viewkeys(available_keywords):
         self.standard_test('fasta', 'newick', keyword)
 def test_simple_phylo_assembly(self):
     """
     Run the standard consensus tree test (NEWICK input, NEWICK output) once
     for every keyword reported by PhyloAssemble.get_keywords() for
     `consense_exe`.
     """
     available_keywords = PhyloAssemble.get_keywords(consense_exe)
     for keyword in viewkeys(available_keywords):
         self.standard_test('newick', 'newick', keyword)
 def test_simple_alignment(self):
     """
     Run the standard alignment test (FASTA input, FASTA output) once for
     every keyword reported by Align.get_keywords() for `muscle_exe`.
     """
     available_keywords = Align.get_keywords(muscle_exe)
     for keyword in viewkeys(available_keywords):
         self.standard_test('fasta', 'fasta', keyword)
Example #12
0
def get_keywords(tool):
    """
    Return the keywords available for a consensus tree tool.

    Arguments :
        tool  ( string )
            Name of the consensus tool (it is lower-cased before the lookup).
            Only the tools registered in _CONS_TOOL_TO_LIB are accepted; the
            supertree lookup is currently disabled, so supertree tool names
            are rejected.

    Returns :
        dict
            Dictionary containing the keywords and their corresponding
            arguments, joined into a single space-separated string.

    Raises :
        ValueError
            If the tool introduced isn't included in MEvoLib.PhyloAssemble.
    """
    tool = tool.lower()
    # NOTE(review): _STREE_TOOL_TO_LIB used to be consulted here as well but
    # that lookup was commented out; re-add it here if supertree tools must
    # be supported again.
    if tool not in _CONS_TOOL_TO_LIB:
        message = 'The tool "{}" isn\'t included in ' \
                  'MEvoLib.PhyloAssemble'.format(tool)
        raise ValueError(message)
    # The membership check above guarantees the lookup succeeds, so the
    # library is bound unconditionally
    tool_lib = _CONS_TOOL_TO_LIB[tool]
    keyword_dict = {}
    for key, value in viewitems(tool_lib.KEYWORDS):
        keyword_dict[key] = ' '.join(value)
    return keyword_dict
Example #13
0
def get_features():
    """
    Returns :
        list
            List with every key of _FEAT_QUAL_DICT, i.e. all possible feature
            keywords that can be found in any GenBank's sequence record.
    """
    return list(viewkeys(_FEAT_QUAL_DICT))
Example #14
0
def get_tools():
    """
    Returns :
        list
            Names (keys of _METHOD_TO_FUNC) of the clustering methods and
            software tools included in the current version of MEvoLib.
    """
    method_names = viewkeys(_METHOD_TO_FUNC)
    return list(method_names)
Example #15
0
def get_tools():
    """
    Returns :
        list
            One entry per key of _METHOD_TO_FUNC: the clustering methods and
            software tools included in the current version of MEvoLib.
    """
    return [method for method in viewkeys(_METHOD_TO_FUNC)]
Example #16
0
def get_tools():
    """
    Returns :
        list
            Names (keys of _TOOL_TO_LIB) of the alignment software tools
            included in the current version of MEvoLib.
    """
    tool_names = viewkeys(_TOOL_TO_LIB)
    return list(tool_names)
Example #17
0
def get_features():
    """
    Returns :
        list
            All the keys of _FEAT_QUAL_DICT: the feature keywords that can be
            found in any GenBank's sequence record.
    """
    feature_keys = viewkeys(_FEAT_QUAL_DICT)
    return list(feature_keys)
Example #18
0
def get_tools():
    """
    Returns :
        list
            One entry per key of _TOOL_TO_LIB: the alignment software tools
            included in the current version of MEvoLib.
    """
    return [tool for tool in viewkeys(_TOOL_TO_LIB)]
Example #19
0
    def update(self, email):
        """
        Update the BioSeqs object from the last NCBI's Entrez database and query
        values stored in the report list. All the sequences stored must have
        their genbank identifier information in the annotations property. The
        deleted sequences from the database will be deleted in the object and
        the new sequences will be fetched and stored.

        The work is done on a copy of the stored dictionary, which only
        replaces the object's data (and adds a new report entry) when at least
        one sequence has been removed or fetched. If fetching a batch fails
        twice in a row, the progress made so far is kept and the method stops
        fetching.

        Arguments:
            email  (string)
                E-mail required by Bio.Entrez.

        Raises:
            ValueError
                If there is no entrez entry in the report list.
            ValueError
                If any sequence hasn't its GenBank identifier information in
                the annotations property.

        * The e-mail information is considered sensitive information and it
        won't be saved in any public or private variable of the object (it is
        only assigned to Entrez.email).
        """
        # Get the last entrez entry of the report
        for record in reversed(self._report):
            if record[1] == 'entrez':
                _, _, entrez_db, query = record
                break
        else:
            message = 'No entrez entry found in object\'s report'
            raise ValueError(message)
        # Perform the update process in a copy of the dictionary to avoid
        # incomplete updates due to unexpected HTTP exceptions
        seq_dict = copy.copy(self.data)
        Entrez.email = email
        db_rettype = _get_entrez_db_rettype(entrez_db)
        # Execute Entrez.esearch() to get the total number of sequences that
        # matches the query in the Entrez database
        handle = Entrez.esearch(db=entrez_db, term=query, rettype='count')
        try:
            num_seqs = int(Entrez.read(handle)['Count'])
        finally:
            handle.close()
        if num_seqs == 0:
            warnings.warn('The query stored didn\'t return any sequence')
            return
        # Execute again Entrez.esearch(), page by page, to get the complete
        # list of Entrez database's sequence identifiers. The offset must be
        # passed as 'retstart' (the E-utilities parameter name); any other
        # spelling is ignored by the server and the first page would be
        # returned over and over again.
        updated_seq_ids = set()
        for index in range(0, num_seqs, MAX_NUM_SEQS):
            handle = Entrez.esearch(db=entrez_db,
                                    term=query,
                                    retstart=index,
                                    retmax=num_seqs)
            try:
                record = Entrez.read(handle)
            finally:
                handle.close()
            updated_seq_ids.update(record['IdList'])
        # Get an "entrez identifier: accession" dictionary of the stored
        # sequences
        gi_acc_dict = {}
        try:
            for key, value in viewitems(seq_dict):
                gi_acc_dict[value.annotations['gi']] = key
        except KeyError:
            message = 'Missing genbank identifier'
            raise ValueError(message)
        # Use that dictionary to check which of the stored identifiers have
        # been removed from the Entrez database
        deprecated_seq_ids = viewkeys(gi_acc_dict)
        ids_to_remove = deprecated_seq_ids - updated_seq_ids
        # Remove all the deprecated sequences
        for gi_value in ids_to_remove:
            accession = gi_acc_dict[gi_value]
            del seq_dict[accession]
            del gi_acc_dict[gi_value]
        # Finally, get the list of new identifiers to fetch
        ids_to_fetch = list(updated_seq_ids.difference(deprecated_seq_ids))
        num_to_fetch = len(ids_to_fetch)
        if num_to_fetch > 0:
            # Fetch the first sequence and estimate the batch size
            fetch_handle = Entrez.efetch(db=entrez_db,
                                         id=ids_to_fetch[0],
                                         retmode='text',
                                         rettype=db_rettype)
            try:
                record_str = fetch_handle.read()
            finally:
                fetch_handle.close()
            record = SeqIO.read(StringIO(record_str), 'genbank')
            seq_dict[record.id] = record
            batch_size = _estimate_batch_size(record_str)
            # In batches of 'batch_size', fetch the Entrez database
            # information of each new sequence in text format through
            # Entrez.efetch()
            start = 1
            except_raised = False
            while start < num_to_fetch:
                end = start + batch_size
                try:
                    fetch_handle = Entrez.efetch(db=entrez_db,
                                                 id=ids_to_fetch[start:end],
                                                 retmode='text',
                                                 rettype=db_rettype)
                except Exception:
                    # Only regular errors are retried; KeyboardInterrupt and
                    # SystemExit are allowed to propagate
                    if not except_raised:
                        # If it is the first time for this batch, wait for a
                        # minute to see if we can recover from the exception
                        warnings.warn(("Exception raised durig fetching"
                                       ". Trying to recover..."))
                        except_raised = True
                        sleep(60)
                    else:
                        warnings.warn(("Exception raised for second "
                                       "time. Saving current progress "
                                       "and exiting."))
                        break
                else:
                    except_raised = False
                    try:
                        for record in SeqIO.parse(fetch_handle, 'genbank'):
                            seq_dict[record.id] = record
                    finally:
                        fetch_handle.close()
                    start += batch_size
        # Commit the changes whenever something was removed or fetched, so
        # that removals are not lost when there is nothing new to download
        if ids_to_remove or num_to_fetch > 0:
            self.data = seq_dict
            # Generate the corresponding report tuple
            date_time = datetime.now().strftime('%Y/%m/%d %H:%M:%S')
            self._report.append((date_time, 'entrez', entrez_db, query))
Example #20
0
    def update(self, email):
        """
        Update the BioSeqs object from the last NCBI's Entrez database and query
        values stored in the report list. All the sequences stored must have
        their genbank identifier information in the annotations property. The
        deleted sequences from the database will be deleted in the object and
        the new sequences will be fetched and stored.

        The work is done on a copy of the stored dictionary, which only
        replaces the object's data (and adds a new report entry) when at least
        one sequence has been removed or fetched.

        Arguments :
            email  ( string )
                E-mail required by Bio.Entrez.

        Raises :
            ValueError
                If there is no entrez entry in the report list.
            ValueError
                If any sequence hasn't its GenBank identifier information in
                the annotations property.

        * The e-mail information is considered sensitive information and it
        won't be saved in any public or private variable of the object (it is
        only assigned to Entrez.email).
        """
        # Get the last entrez entry of the report
        for record in reversed(self._report):
            if record[1] == "entrez":
                _, _, entrez_db, query = record
                break
        else:
            message = "No entrez entry found in object's report"
            raise ValueError(message)
        # Perform the update process in a copy of the dictionary to avoid
        # incomplete updates due to unexpected HTTP exceptions
        seq_dict = copy.copy(self.data)
        Entrez.email = email
        db_rettype = _get_entrez_db_rettype(entrez_db)
        # Execute Entrez.esearch() to get the total number of sequences that
        # matches the query in the Entrez database
        handle = Entrez.esearch(db=entrez_db, term=query, rettype="count")
        try:
            num_seqs = int(Entrez.read(handle)["Count"])
        finally:
            handle.close()
        if num_seqs == 0:
            warnings.warn("The query stored didn't return any sequence")
            return
        # Execute again Entrez.esearch(), page by page, to get the complete
        # list of Entrez database's sequence identifiers. The offset must be
        # passed as 'retstart' (the E-utilities parameter name); any other
        # spelling is ignored by the server and the first page would be
        # returned over and over again.
        updated_seq_ids = set()
        for index in range(0, num_seqs, 100000):
            handle = Entrez.esearch(db=entrez_db, term=query, retstart=index, retmax=num_seqs)
            try:
                record = Entrez.read(handle)
            finally:
                handle.close()
            updated_seq_ids.update(record["IdList"])
        # Get an "entrez identifier: accession" dictionary of the stored
        # sequences
        gi_acc_dict = {}
        try:
            for key, value in viewitems(seq_dict):
                gi_acc_dict[value.annotations["gi"]] = key
        except KeyError:
            message = "Missing genbank identifier"
            raise ValueError(message)
        # Use that dictionary to check which of the stored identifiers have
        # been removed from the Entrez database
        deprecated_seq_ids = viewkeys(gi_acc_dict)
        ids_to_remove = deprecated_seq_ids - updated_seq_ids
        # Remove all the deprecated sequences
        for gi_value in ids_to_remove:
            accession = gi_acc_dict[gi_value]
            del seq_dict[accession]
            del gi_acc_dict[gi_value]
        # Finally, get the list of new identifiers to fetch
        ids_to_fetch = list(updated_seq_ids.difference(deprecated_seq_ids))
        num_to_fetch = len(ids_to_fetch)
        if num_to_fetch > 0:
            # Fetch the first sequence and estimate the batch size
            fetch_handle = Entrez.efetch(db=entrez_db, id=ids_to_fetch[0], retmode="text", rettype=db_rettype)
            try:
                record_str = fetch_handle.read()
            finally:
                fetch_handle.close()
            record = SeqIO.read(StringIO(record_str), "genbank")
            seq_dict[record.id] = record
            batch_size = _estimate_batch_size(record_str)
            # In batches of 'batch_size', fetch the Entrez database
            # information of each new sequence in text format through
            # Entrez.efetch()
            for start in range(1, num_to_fetch, batch_size):
                end = start + batch_size
                fetch_handle = Entrez.efetch(
                    db=entrez_db, id=ids_to_fetch[start:end], retmode="text", rettype=db_rettype
                )
                try:
                    for record in SeqIO.parse(fetch_handle, "genbank"):
                        seq_dict[record.id] = record
                finally:
                    fetch_handle.close()
        # Commit the changes whenever something was removed or fetched, so
        # that removals are not lost when there is nothing new to download
        if ids_to_remove or num_to_fetch > 0:
            self.data = seq_dict
            # Generate the corresponding report tuple
            date_time = datetime.now().strftime("%Y/%m/%d %H:%M:%S")
            self._report.append((date_time, "entrez", entrez_db, query))
Example #21
0
def map_seqs(record_list,
             feature_filter=None,
             ref_seq=None,
             alignment_bin=None,
             log_file=None):
    """
    Gene splicing of the sequences at 'record_list'. By default, the gene
    location is extracted from the feature list of each sequence. If there is no
    list, that sequence is classified as "unprocessable" or, if a reference
    sequence is given, the reference features are used to extract the different
    genes (through a normalization process using an alignment tool). All the
    features are returned unless a list of feature keywords are passed through
    'feature_filter' parameter. If a log file path is given and any file exists
    with that name, the file will be overwritten without any warning.

    Arguments :
        record_list  ( list )
            List of SeqRecord objects (from Biopython).
        feature_filter  ( Optional[list] )
            List of feature keywords the user wants to be returned (from all the
            possible ones).
        ref_seq  ( Optional[string] )
            Keyword (from MEvoLib.Data) or file path (GENBANK format) of the
            reference sequence.
        alignment_bin  ( Optional[string] )
            Binary path of the alignment tool (only required if a reference
            sequence is passed).
        log_file  ( Optional[string] )
            Absolute path for the log file. If it is not given, the log is
            written to a named temporary file that is not deleted when the
            function ends.

    Returns :
        dict
            Dictionary with the set identifiers as keys and the corresponding
            sequence fragments as values in lists of SeqRecord objects. The
            keys have the form "<feature type>.<first qualifier>". When no
            reference sequence is given, the extra key 'unprocessable' holds
            the records that have at most one feature.

    Raises :
        IOError
            If the reference sequence's file path doesn't exist.
        RuntimeError
            If the call to the alignment tool command raises an exception.

    * Reference sequence's file must be in GENBANK format.
    """
    # Load the desired feature keywords as keys of the gene dictionary and a
    # term dictionary with a list of sequences for each qualifier of any
    # selected feature
    if (feature_filter):
        gene_dict = dict((key, {}) for key in feature_filter)
        term_dict = dict((key, {}) for key in feature_filter)
    else:  # feature_filter is None
        gene_dict = dict((key, {}) for key in viewkeys(_FEAT_QUAL_DICT))
        term_dict = dict((key, {}) for key in viewkeys(_FEAT_QUAL_DICT))
    # Get the reference sequence's SeqRecord object or create an unprocessable
    # list for those sequences without gene information
    if (ref_seq in _REF_SEQ_DICT):
        refseq_record = _REF_SEQ_DICT[ref_seq].RECORD
    elif (ref_seq):  # ref_seq != None
        refseq_record = SeqIO.read(ref_seq, 'gb')
    else:  # ref_seq is None
        unprocessable = []
    # Counted manually instead of using len(), so 'record_list' is only
    # required to be iterable
    num_seqs = 0
    # Iterate over all the records to get their gene division
    for record in record_list:
        num_seqs += 1
        if (len(record.features) <= 1):
            # GenBank's "source" feature key is mandatory
            if (ref_seq):
                # Replace the sequence and the features of the record with the
                # values returned by the normalization against the reference
                record.seq, record.features = _normalization(
                    record, refseq_record, alignment_bin)
            else:  # ref_seq is None
                unprocessable.append(record)
                continue
        # else : # len(record.features) > 1
        # The first feature is skipped (presumably the mandatory "source"
        # entry) and only the feature types being collected are kept
        record_features = (feat for feat in record.features[1:]
                           if feat.type in gene_dict)
        for feature in record_features:
            # Create a set of qualifiers of the record from the main fields of
            # GenBank (pre-saved in _FEAT_QUAL_DICT)
            record_qualifiers = set()
            for qualifier_key in iter(_FEAT_QUAL_DICT[feature.type]):
                if (qualifier_key in feature.qualifiers):
                    record_qualifiers.update(
                        (_string_filter(x)
                         for x in feature.qualifiers[qualifier_key]))
            if (not record_qualifiers):
                # 'record_qualifiers' is empty
                record_qualifiers.add(feature.type)
            # Generate a string of the qualifiers' set to store it as a
            # description of the gene SeqRecord object (qualifiers are sorted
            # by length first and alphabetically second)
            qualifier_id = ':'.join(
                sorted(record_qualifiers, key=lambda item: (len(item), item)))
            feature_record = SeqRecord(feature.extract(record).seq,
                                       id=record.id,
                                       name=record.id,
                                       description=qualifier_id)
            # Add new terms to the corresponding entry of the dictionary for
            # the given feature, or add the sequence record id to the existing
            # entry
            for pair in itertools.combinations(qualifier_id.split(':'), 2):
                if (pair not in term_dict[feature.type]):
                    term_dict[feature.type][pair] = set([record.id])
                else:  # pair in term_dict[feature.type]
                    term_dict[feature.type][pair].add(record.id)
            # Merge possible matching qualifiers for the same type of feature
            # (the dictionary is only read here; the keys collected in
            # 'qualifiers_to_merge' are removed after this loop)
            qualifiers_to_merge = []
            for key in viewkeys(gene_dict[feature.type]):
                key_set = set(key.split(':'))
                if (not record_qualifiers.isdisjoint(key_set)):
                    if (record_qualifiers <= key_set):
                        # The existing key already covers every qualifier of
                        # the record: adopt the whole key
                        record_qualifiers.update(key_set)
                    elif (record_qualifiers > key_set):
                        # The record strictly extends the existing key: that
                        # key will be absorbed by the new one
                        qualifiers_to_merge.append(key)
                    else:
                        # 'record_qualifiers' and 'key_set' differ but their
                        # intersection is not empty
                        record_qualifiers.update(key_set)
                        qualifiers_to_merge.append(key)
            # Generate new qualifier string
            qualifier_id = ':'.join(
                sorted(record_qualifiers, key=lambda item: (len(item), item)))
            # Add the new gene SeqRecord object to the dictionary
            if (qualifier_id not in gene_dict[feature.type]):
                gene_dict[feature.type][qualifier_id] = [feature_record]
            else:  # qualifier_id in gene_dict[feature.type]
                gene_dict[feature.type][qualifier_id].append(feature_record)
            # Merge those qualifiers that belong to the same gene
            for qualifier_key in qualifiers_to_merge:
                if (qualifier_key != qualifier_id):
                    gene_dict[feature.type][qualifier_id].extend(
                        gene_dict[feature.type][qualifier_key])
                    del gene_dict[feature.type][qualifier_key]
    # The error calculation has been extracted from the following sampling
    # statistics equation:
    #
    #                   N * Z^2 * p * (1-p)
    #         n = -------------------------------
    #              (N-1) * e^2 + Z^2 * p * (1-p)
    #
    # where N is the number of sequences, n is the minimum sampling size
    # (threshold), e is the error fixed to 0.01, Z is fixed to get a 0.99
    # confidence interval and p is assumed to be 0.5.
    e_value = 0.01
    z_value = 2.58
    p_value = 0.5
    coef = math.pow(z_value, 2) * p_value * (1.0 - p_value)
    threshold = math.ceil((num_seqs * coef) / \
                          ((num_seqs - 1.0) * math.pow(e_value, 2) + coef))
    # If no log file path is provided, save log content in a named temporary
    # file that won't be deleted after the function ends
    if (not log_file):
        log_file = (tempfile.NamedTemporaryFile(delete=False)).name
    # Clean-up empty features and merge qualifiers dict keys with features dict
    # keys to get a {str: list} dict for all the genes
    set_dict = {}
    with open(log_file, 'w') as log:
        for feat_key, feat_value in iter(viewitems(gene_dict)):
            if (feat_value):
                log.write('> {}\n'.format(feat_key))
                for qual_key, qual_value in iter(viewitems(feat_value)):
                    # Generation of the content of the set dictionary that will
                    # be returned (only the first qualifier of the key, i.e.
                    # the shortest one, is used to name the set)
                    new_key = '{}.{}'.format(feat_key, qual_key.split(':')[0])
                    set_dict.setdefault(new_key, []).extend(qual_value)
                    # For every existing pair of qualifiers, if the number of
                    # records that hold both is below the calculated threshold,
                    # it might be the result of a typo in those records'
                    # information (further review of the log file is advisable)
                    for pair in itertools.combinations(qual_key.split(':'), 2):
                        if (pair in term_dict[feat_key]):
                            sampling_size = len(term_dict[feat_key][pair])
                            if (sampling_size < threshold):
                                seq_list = list(term_dict[feat_key][pair])
                                text = '\t{}\n'.format(' || '.join(pair))
                                # Record identifiers are written six per line
                                for i in range(0, sampling_size // 6 + 1):
                                    text += '\t\t{}\n'.format(' '.join(
                                        seq_list[i * 6:(i + 1) * 6]))
                                if ((sampling_size % 6) != 0):
                                    text += '\n'
                                log.write(text)
                log.write('\n')
    # If no reference sequence has been introduced, include in the gene dict
    # those sequences that couldn't be processed due to lack of information
    if (not ref_seq):
        set_dict['unprocessable'] = unprocessable
    return (set_dict)
Example #22
0
def map_seqs(record_list, feature_filter=None, ref_seq=None,
             alignment_bin=None, log_file=None):
    """
    Gene splicing of the sequences at 'record_list'. By default, the gene
    location is extracted from the feature list of each sequence. If there is no
    list, that sequence is classified as "unprocessable" or, if a reference
    sequence is given, the reference features are used to extract the different
    genes (through a normalization process using an alignment tool). All the
    features are returned unless a list of feature keywords are passed through
    'feature_filter' parameter. If a log file path is given and any file exists
    with that name, the file will be overwritten without any warning.

    Arguments :
        record_list  ( list )
            List of SeqRecord objects (from Biopython). Records with at most one
            feature are modified in place ('seq' and 'features' are replaced)
            when a reference sequence is given.
        feature_filter  ( Optional[list] )
            List of feature keywords the user wants to be returned (from all the
            possible ones).
        ref_seq  ( Optional[string] )
            Keyword (from MEvoLib.Data) or file path (GENBANK format) of the
            reference sequence.
        alignment_bin  ( Optional[string] )
            Binary path of the alignment tool (only required if a reference
            sequence is passed).
        log_file  ( Optional[string] )
            Absolute path for the log file. If omitted, the log is written to a
            named temporary file that is not deleted afterwards.

    Returns :
        dict
            Dictionary with the set identifiers as keys and the corresponding
            sequence fragments as values in lists of SeqRecord objects. Keys
            have the form '<feature type>.<first qualifier>'. When no reference
            sequence is given, the extra key 'unprocessable' holds the records
            that had no gene information.

    Raises :
        IOError
            If the reference sequence's file path doesn't exist.
        RuntimeError
            If the call to the alignment tool command raises an exception.

    * Reference sequence's file must be in GENBANK format.
    """
    def _length_then_text(item):
        # Shortest qualifiers first, ties broken alphabetically, so that the
        # generated identifiers are deterministic
        return (len(item), item)

    # Load the desired feature keywords as keys of the gene dictionary and a
    # term dictionary with a set of sequence ids for each pair of qualifiers
    # of any selected feature
    feature_keys = feature_filter if feature_filter else _FEAT_QUAL_DICT
    gene_dict = dict((key, {}) for key in feature_keys)
    term_dict = dict((key, {}) for key in gene_dict)
    # Get the reference sequence's SeqRecord object. The unprocessable list is
    # always created so it can never be referenced before assignment
    unprocessable = []
    refseq_record = None
    if ref_seq in _REF_SEQ_DICT:
        refseq_record = _REF_SEQ_DICT[ref_seq].RECORD
    elif ref_seq:
        refseq_record = SeqIO.read(ref_seq, 'gb')
    num_seqs = 0
    # Iterate over all the records to get their gene division
    for record in record_list:
        num_seqs += 1
        if len(record.features) <= 1:
            # GenBank's "source" feature key is mandatory, so a single feature
            # means that the record has no gene information
            if ref_seq:
                record.seq, record.features = _normalization(record,
                                                             refseq_record,
                                                             alignment_bin)
            else:
                unprocessable.append(record)
                continue
        # The first feature is skipped and only the selected feature types
        # are processed
        for feature in record.features[1:]:
            if feature.type not in gene_dict:
                continue
            # Create a set of qualifiers of the record from the main fields of
            # GenBank (pre-saved in _FEAT_QUAL_DICT)
            record_qualifiers = set()
            for qualifier_key in _FEAT_QUAL_DICT[feature.type]:
                if qualifier_key in feature.qualifiers:
                    record_qualifiers.update(
                        _string_filter(x)
                        for x in feature.qualifiers[qualifier_key])
            if not record_qualifiers:
                # No known qualifier found: fall back to the feature type
                record_qualifiers.add(feature.type)
            # Generate a string of the qualifiers' set to store it as a
            # description of the gene SeqRecord object
            qualifier_id = ':'.join(sorted(record_qualifiers,
                                           key=_length_then_text))
            feature_record = SeqRecord(feature.extract(record).seq,
                                       id=record.id, name=record.id,
                                       description=qualifier_id)
            # Register the record id under every pair of qualifiers that it
            # holds for the given feature
            feature_terms = term_dict[feature.type]
            for pair in itertools.combinations(qualifier_id.split(':'), 2):
                feature_terms.setdefault(pair, set()).add(record.id)
            # Merge possible matching qualifiers for the same type of feature
            feature_genes = gene_dict[feature.type]
            qualifiers_to_merge = []
            for key in feature_genes:
                key_set = set(key.split(':'))
                if record_qualifiers.isdisjoint(key_set):
                    continue
                if record_qualifiers <= key_set:
                    # The existing key already covers this record: adopt it
                    record_qualifiers.update(key_set)
                elif record_qualifiers > key_set:
                    # The record covers the existing key: absorb its entries
                    qualifiers_to_merge.append(key)
                else:
                    # 'record_qualifiers' and 'key_set' differ but their
                    # intersection is not empty
                    record_qualifiers.update(key_set)
                    qualifiers_to_merge.append(key)
            # Generate new qualifier string
            qualifier_id = ':'.join(sorted(record_qualifiers,
                                           key=_length_then_text))
            # Add the new gene SeqRecord object to the dictionary
            feature_genes.setdefault(qualifier_id, []).append(feature_record)
            # Merge those qualifiers that belong to the same gene
            for qualifier_key in qualifiers_to_merge:
                if qualifier_key != qualifier_id:
                    feature_genes[qualifier_id].extend(
                        feature_genes.pop(qualifier_key))
    # The error calculation has been extracted from the following sampling
    # statistics equation:
    #
    #                   N * Z^2 * p * (1-p)
    #         n = -------------------------------
    #              (N-1) * e^2 + Z^2 * p * (1-p)
    #
    # where N is the number of sequences, n is the minimum sampling size
    # (threshold), e is the error fixed to 0.01, Z is fixed to get a 0.99
    # confidence interval and p is assumed to be 0.5.
    e_value = 0.01
    z_value = 2.58
    p_value = 0.5
    coef = math.pow(z_value, 2) * p_value * (1.0 - p_value)
    threshold = math.ceil((num_seqs * coef) /
                          ((num_seqs - 1.0) * math.pow(e_value, 2) + coef))
    # If no log file path is provided, save log content in a named temporary
    # file that won't be deleted after the function ends. The handle is closed
    # right away because the file is reopened by name below
    if not log_file:
        with tempfile.NamedTemporaryFile(delete=False) as tmp_log:
            log_file = tmp_log.name
    # Clean-up empty features and merge qualifiers dict keys with features dict
    # keys to get a {str: list} dict for all the genes
    set_dict = {}
    with open(log_file, 'w') as log:
        for feat_key, feat_value in viewitems(gene_dict):
            if not feat_value:
                continue
            log.write('> {}\n'.format(feat_key))
            for qual_key, qual_value in viewitems(feat_value):
                terms = qual_key.split(':')
                # Generation of the content of the set dictionary that will
                # be returned
                new_key = '{}.{}'.format(feat_key, terms[0])
                set_dict.setdefault(new_key, []).extend(qual_value)
                # For every existing pair of qualifiers, if the number of
                # records that hold both is below the calculated threshold,
                # it might be the result of a typo in those records'
                # information (further review of the log file is advisable)
                for pair in itertools.combinations(terms, 2):
                    if pair not in term_dict[feat_key]:
                        continue
                    sampling_size = len(term_dict[feat_key][pair])
                    if sampling_size >= threshold:
                        continue
                    seq_list = list(term_dict[feat_key][pair])
                    # Header with the pair, then the record ids in rows of six
                    lines = ['\t{}\n'.format(' || '.join(pair))]
                    for i in range(sampling_size // 6 + 1):
                        lines.append('\t\t{}\n'.format(
                            ' '.join(seq_list[i * 6:(i + 1) * 6])))
                    if (sampling_size % 6) != 0:
                        lines.append('\n')
                    log.write(''.join(lines))
            log.write('\n')
    # If no reference sequence has been introduced, include in the gene dict
    # those sequences that couldn't be processed due to lack of information
    if not ref_seq:
        set_dict['unprocessable'] = unprocessable
    return set_dict