Esempio n. 1
0
def _stockholm_to_tabular_msa(fh, constructor=None):
    # Checks that user has passed required constructor parameter
    if constructor is None:
        raise ValueError("Must provide `constructor` parameter indicating the "
                         "type of sequences in the alignment. `constructor` "
                         "must be a subclass of `GrammaredSequence` "
                         "(e.g., `DNA`, `RNA`, `Protein`).")
    # Checks that contructor parameter is supported
    elif not issubclass(constructor, GrammaredSequence):
        raise TypeError("`constructor` must be a subclass of "
                        "`GrammaredSequence`.")

    # Checks that the file isn't empty
    try:
        line = next(fh)
    except StopIteration:
        raise StockholmFormatError("File is empty.")
    # Checks that the file follows basic format (includes the required header)
    if not _is_header(line):
        raise StockholmFormatError("File missing required Stockholm header "
                                   "line.")
    msa_data = _MSAData()
    for line in fh:
        if line.isspace():
            continue

        line = line.rstrip('\n')

        if _is_sequence_line(line):
            seq_name, seq_data = _parse_sequence_line(line)
            msa_data.add_sequence(seq_name, seq_data)
        elif line.startswith("#=GF"):
            feature_name, feature_data = _parse_gf_line(line)
            msa_data.add_gf_metadata(feature_name, feature_data)
        elif line.startswith("#=GS"):
            seq_name, feature_name, feature_data = _parse_gs_line(line)
            msa_data.add_gs_metadata(seq_name, feature_name, feature_data)
        elif line.startswith("#=GR"):
            seq_name, feature_name, feature_data = _parse_gr_line(line)
            msa_data.add_gr_metadata(seq_name, feature_name, feature_data)
        elif line.startswith('#=GC'):
            feature_name, feature_data = _parse_gc_line(line)
            msa_data.add_gc_metadata(feature_name, feature_data)
        elif _is_footer(line):
            break
        else:
            raise StockholmFormatError("Unrecognized line: %r" % line)

    if not _is_footer(line):
        raise StockholmFormatError('Final line does not conform to Stockholm '
                                   'format. Must contain only "//".')

    return msa_data.build_tabular_msa(constructor)
Esempio n. 2
0
def _format_positional_metadata(df, data_type):
    # Asserts positional metadata feature names are unique
    if not df.columns.is_unique:
        num_repeated_columns = len(df.columns) - len(set(df.columns))
        raise StockholmFormatError('%s feature names must be unique. '
                                   'Found %d duplicate names.'
                                   % (data_type, num_repeated_columns))

    str_df = df.astype(str)

    # Asserts positional metadata dataframe items are one character long
    for column in str_df.columns:
        if (str_df[column].str.len() != 1).any():
            raise StockholmFormatError("%s must contain a single character for"
                                       " each position's value. Found value(s)"
                                       " in column %s of incorrect length."
                                       % (data_type, column))
    return str_df
Esempio n. 3
0
 def add_gf_metadata(self, feature_name, feature_data):
     """Record one ``#=GF`` feature line in ``self._metadata``.

     Parameters
     ----------
     feature_name : str
         The GF tag (e.g. ``'TN'``, ``'NH'``, ``'RN'``).
     feature_data : str
         The text that followed the tag on the line.

     Raises
     ------
     StockholmFormatError
         If a tree name is repeated, if a ``TN`` tag follows a tree that
         was stored without a name, or if a reference tag appears before
         any ``RN`` tag.

     Notes
     -----
     Trees introduced by ``TN`` are stored under ``'NH'`` as an
     ``OrderedDict`` mapping tree name to tree text; an ``NH`` line with no
     preceding ``TN`` is stored under ``'NH'`` as a plain string. Each
     ``RN`` line starts a new ``OrderedDict`` in the list stored under
     ``'RN'``, and tags in ``_REFERENCE_TAGS`` are added to the most recent
     one. Any other repeated tag is concatenated onto the stored value
     using the separator returned by ``_get_padding``.
     """
     # Handles a labelled tree (first or subsequent)
     if feature_name == 'TN':
         if 'NH' not in self._metadata:
             self._metadata['NH'] = OrderedDict()
         trees = self._metadata['NH']
         # An earlier unlabelled tree is stored as a plain string; it cannot
         # hold named trees, and testing membership against it would be a
         # substring search on the tree text rather than a name lookup.
         if not isinstance(trees, OrderedDict):
             raise StockholmFormatError("Tree name %r found after a tree "
                                        "with no name. Every tree must be "
                                        "preceded by a 'TN' tag when trees "
                                        "are named." % feature_data)
         if feature_data in trees:
             raise StockholmFormatError("Tree name %r used multiple times "
                                        "in file." % feature_data)
         trees[feature_data] = ''
     # Handles extra line(s) of an already created tree
     elif feature_name == 'NH' and feature_name in self._metadata:
         trees = self._metadata[feature_name]
         if isinstance(trees, OrderedDict):
             # Continuation lines belong to the most recently named tree.
             tree_id = next(reversed(trees))
             self._metadata[feature_name][tree_id] = (trees[tree_id] +
                                                      feature_data)
         else:
             self._metadata[feature_name] = (self._metadata[feature_name] +
                                             feature_data)
     elif feature_name == 'RN':
         # Each RN line opens a fresh reference record.
         if feature_name not in self._metadata:
             self._metadata[feature_name] = [OrderedDict()]
         else:
             self._metadata[feature_name].append(OrderedDict())
     elif feature_name in _REFERENCE_TAGS:
         if 'RN' not in self._metadata:
             raise StockholmFormatError("Expected 'RN' tag to precede "
                                        "'%s' tag." % feature_name)
         reference_dict = self._metadata['RN'][-1]
         if feature_name not in reference_dict:
             reference_dict[feature_name] = feature_data
         else:
             padding = _get_padding(reference_dict[feature_name])
             reference_dict[feature_name] += padding + feature_data
     elif feature_name in self._metadata:
         padding = _get_padding(self._metadata[feature_name][-1])
         self._metadata[feature_name] = (self._metadata[feature_name] +
                                         padding + feature_data)
     else:
         self._metadata[feature_name] = feature_data
Esempio n. 4
0
def _tabular_msa_to_stockholm(obj, fh):
    """Write the alignment `obj` to `fh` in Stockholm format.

    Output order: header line, ``#=GF`` lines from the alignment metadata,
    ``#=GS`` lines from each sequence's metadata, then the sequences with
    their ``#=GR`` rows and the ``#=GC`` rows (all handed to
    ``_write_padded_data``), and finally the ``//`` footer.

    Parameters
    ----------
    obj
        Alignment to write. It is iterated for its sequences and must
        expose ``index``, ``metadata``, ``positional_metadata`` and the
        matching ``has_*`` methods.
    fh
        Writable text handle.

    Raises
    ------
    StockholmFormatError
        If the index labels are not unique, or if positional metadata
        fails the checks in ``_format_positional_metadata``.
    """
    if not obj.index.is_unique:
        raise StockholmFormatError(
            "The TabularMSA's index labels must be unique.")

    # Header
    fh.write("# STOCKHOLM 1.0\n")

    # GF lines; a dict stored under 'NH' holds several named trees.
    if obj.has_metadata():
        for tag, value in viewitems(obj.metadata):
            holds_named_trees = tag == 'NH' and isinstance(value, dict)
            if not holds_named_trees:
                fh.write("#=GF %s %s\n" % (tag, value))
                continue
            for tree_name, tree_text in viewitems(obj.metadata[tag]):
                fh.write("#=GF TN %s\n" % tree_name)
                fh.write("#=GF NH %s\n" % tree_text)

    # (label, data) pairs that are written later as padded columns.
    rows = []

    # GS lines are written immediately; sequence and GR rows are collected.
    for seq, label in zip(obj, obj.index):
        label = str(label)
        if seq.has_metadata():
            for tag, value in viewitems(seq.metadata):
                fh.write("#=GS %s %s %s\n" % (label, tag, value))
        rows.append((label, str(seq)))
        if not seq.has_positional_metadata():
            continue
        frame = _format_positional_metadata(
            seq.positional_metadata,
            'Sequence-specific positional metadata (GR)')
        for column in frame.columns:
            column_text = ''.join(frame[column])
            rows.append(("#=GR %s %s" % (label, column), column_text))

    # GC rows come after every sequence.
    if obj.has_positional_metadata():
        frame = _format_positional_metadata(
            obj.positional_metadata,
            'Multiple sequence alignment positional metadata (GC)')
        for column in frame.columns:
            column_text = ''.join(frame[column])
            rows.append(("#=GC %s" % column, column_text))

    # Sequence, GR and GC rows are emitted together with padding.
    _write_padded_data(rows, fh)

    # Footer
    fh.write("//\n")
Esempio n. 5
0
 def add_gf_metadata(self, feature_name, feature_data):
     """Record one ``#=GF`` feature line in ``self._metadata``.

     Parameters
     ----------
     feature_name : str
         The GF tag (e.g. ``'TN'``, ``'NH'``).
     feature_data : str
         The text that followed the tag on the line.

     Raises
     ------
     StockholmFormatError
         If a tree name is repeated, or if a ``TN`` tag follows a tree that
         was stored without a name.

     Notes
     -----
     Trees introduced by ``TN`` are stored under ``'NH'`` as an
     ``OrderedDict`` mapping tree name to tree text; an ``NH`` line with no
     preceding ``TN`` is stored under ``'NH'`` as a plain string. Repeated
     occurrences of any other tag are concatenated onto the stored value.
     """
     # Handles a labelled tree (first or subsequent)
     if feature_name == 'TN':
         if 'NH' not in self._metadata:
             self._metadata['NH'] = OrderedDict()
         trees = self._metadata['NH']
         # An earlier unlabelled tree is stored as a plain string; it cannot
         # hold named trees, and testing membership against it would be a
         # substring search on the tree text rather than a name lookup.
         if not isinstance(trees, OrderedDict):
             raise StockholmFormatError("Tree name %r found after a tree "
                                        "with no name. Every tree must be "
                                        "preceded by a 'TN' tag when trees "
                                        "are named." % feature_data)
         if feature_data in trees:
             raise StockholmFormatError("Tree name %r used multiple times "
                                        "in file." % feature_data)
         trees[feature_data] = ''
     # Handles extra line(s) of an already created tree
     elif feature_name == 'NH' and feature_name in self._metadata:
         trees = self._metadata[feature_name]
         if isinstance(trees, OrderedDict):
             # Continuation lines belong to the most recently named tree.
             tree_id = next(reversed(trees))
             trees[tree_id] = trees[tree_id] + feature_data
         else:
             # Unlabelled tree spread over several lines: it is a plain
             # string, so extend it directly instead of looking up a name.
             self._metadata[feature_name] = trees + feature_data
     elif feature_name in self._metadata:
         self._metadata[feature_name] = (self._metadata[feature_name] +
                                         feature_data)
     else:
         self._metadata[feature_name] = feature_data
Esempio n. 6
0
    def build_tabular_msa(self, constructor):
        """Assemble the collected data into a ``TabularMSA``.

        Parameters
        ----------
        constructor
            Passed to each stored sequence's ``build_sequence`` method.

        Returns
        -------
        TabularMSA
            Built from the sequences in ``self._seq_order`` order, indexed
            by those names. Empty metadata and empty positional metadata
            are passed as ``None``.

        Raises
        ------
        StockholmFormatError
            If ``self._seqs`` and ``self._seq_order`` differ in length; the
            message lists the names present in ``self._seqs`` but missing
            from ``self._seq_order``.
        """
        if len(self._seqs) != len(self._seq_order):
            unknown_names = set(self._seqs) - set(self._seq_order)
            raise StockholmFormatError(
                'Found GS or GR metadata for nonexistent sequence(s): %r'
                % unknown_names)

        built_seqs = [self._seqs[name].build_sequence(constructor)
                      for name in self._seq_order]

        # Falsy (empty) containers are replaced by None.
        positional_metadata = self._positional_metadata or None
        metadata = self._metadata or None

        return TabularMSA(built_seqs, metadata=metadata,
                          positional_metadata=positional_metadata,
                          index=self._seq_order)
Esempio n. 7
0
def _check_for_malformed_line(line, expected_len):
    if len(line) != expected_len:
        raise StockholmFormatError('Line contains %d item(s). It must '
                                   'contain exactly %d item(s).'
                                   % (len(line), expected_len))
Esempio n. 8
0
def _raise_duplicate_error(message):
    """Raise ``StockholmFormatError`` with an interleaved-format hint.

    Parameters
    ----------
    message : str
        Description of the duplicate; the hint is appended to it.

    Raises
    ------
    StockholmFormatError
        Always.
    """
    interleaved_hint = (' Note: If the file being used is in '
                        'Stockholm interleaved format, this '
                        'is not supported by the reader.')
    raise StockholmFormatError(message + interleaved_hint)
Esempio n. 9
0
def _tabular_msa_to_stockholm(obj, fh):
    """Write the alignment `obj` to `fh` in Stockholm format.

    Output order: header line, ``#=GF`` lines from the alignment metadata
    (with named trees and numbered references expanded), ``#=GS`` lines
    from each sequence's metadata, then the sequences with their ``#=GR``
    rows and the ``#=GC`` rows (all handed to ``_write_padded_data``), and
    finally the ``//`` footer.

    Parameters
    ----------
    obj
        Alignment to write. It is iterated for its sequences and must
        expose ``index``, ``metadata``, ``positional_metadata`` and the
        matching ``has_*`` methods.
    fh
        Writable text handle.

    Raises
    ------
    StockholmFormatError
        If the index labels are not unique, if ``'RN'`` metadata is not a
        list of dictionaries keyed by tags in ``_REFERENCE_TAGS``, or if
        positional metadata fails the checks in
        ``_format_positional_metadata``. Lines already written before the
        error stay in `fh`.
    """
    if not obj.index.is_unique:
        raise StockholmFormatError(
            "The TabularMSA's index labels must be unique.")

    # Header
    fh.write("# STOCKHOLM 1.0\n")

    # GF lines
    if obj.has_metadata():
        for tag, value in obj.metadata.items():
            # A dict under 'NH' holds several named trees.
            if tag == 'NH' and isinstance(value, dict):
                for tree_name, tree_text in value.items():
                    fh.write("#=GF TN %s\n" % tree_name)
                    fh.write("#=GF NH %s\n" % tree_text)
                continue

            if tag == 'RN':
                if not isinstance(value, list):
                    raise StockholmFormatError(
                        "Expected 'RN' to contain a list of reference "
                        "dictionaries, got %r." % value)

                # References are numbered from 1 in list order.
                for number, reference in enumerate(value, start=1):
                    if not isinstance(reference, dict):
                        raise StockholmFormatError(
                            "Expected reference information to be stored as a "
                            "dictionary, found reference %d stored as %r." %
                            (number, type(reference).__name__))

                    fh.write("#=GF RN [%d]\n" % number)
                    for ref_tag in reference:
                        if ref_tag not in _REFERENCE_TAGS:
                            valid_tags = ', '.join(_REFERENCE_TAGS)
                            raise StockholmFormatError(
                                "Invalid reference tag %r found in reference "
                                "dictionary %d. Valid reference tags are: %s."
                                % (ref_tag, number, valid_tags))

                        fh.write("#=GF %s %s\n" %
                                 (ref_tag, reference[ref_tag]))
            else:
                fh.write("#=GF %s %s\n" % (tag, value))

    # (label, data) pairs that are written later as padded columns.
    rows = []

    # GS lines are written immediately; sequence and GR rows are collected.
    for seq, label in zip(obj, obj.index):
        label = str(label)

        if seq.has_metadata():
            for tag, value in seq.metadata.items():
                fh.write("#=GS %s %s %s\n" % (label, tag, value))

        rows.append((label, str(seq)))
        if not seq.has_positional_metadata():
            continue
        frame = _format_positional_metadata(
            seq.positional_metadata,
            'Sequence-specific positional metadata (GR)')
        for column in frame.columns:
            column_text = ''.join(frame[column])
            rows.append(("#=GR %s %s" % (label, column), column_text))

    # GC rows come after every sequence.
    if obj.has_positional_metadata():
        frame = _format_positional_metadata(
            obj.positional_metadata,
            'Multiple sequence alignment positional metadata (GC)')
        for column in frame.columns:
            column_text = ''.join(frame[column])
            rows.append(("#=GC %s" % column, column_text))

    # Sequence, GR and GC rows are emitted together with padding.
    _write_padded_data(rows, fh)

    # Footer
    fh.write("//\n")