Example #1
# Imports assumed by the examples below; addDataProcessing is a helper from
# the surrounding module (not shown here) that appends a DataProcessing entry
# to the given map.
import pyopenms as pms
from collections import Counter

def id_mapper(in_file, id_file, out_file, params, use_centroid_rt,
              use_centroid_mz, use_subelements):

    in_type = pms.FileHandler.getType(in_file)

    protein_ids = []
    peptide_ids = []

    pms.IdXMLFile().load(id_file, protein_ids, peptide_ids)

    mapper = pms.IDMapper()
    mapper.setParameters(params)

    if in_type == pms.Type.CONSENSUSXML:
        file_ = pms.ConsensusXMLFile()
        map_ = pms.ConsensusMap()
        file_.load(in_file, map_)
        mapper.annotate(map_, peptide_ids, protein_ids, use_subelements)
        addDataProcessing(
            map_, params,
            pms.DataProcessing.ProcessingAction.IDENTIFICATION_MAPPING)
        file_.store(out_file, map_)

    elif in_type == pms.Type.FEATUREXML:
        file_ = pms.FeatureXMLFile()
        map_ = pms.FeatureMap()
        file_.load(in_file, map_)
        mapper.annotate(map_, peptide_ids, protein_ids, use_centroid_rt,
                        use_centroid_mz)
        addDataProcessing(
            map_, params,
            pms.DataProcessing.ProcessingAction.IDENTIFICATION_MAPPING)
        file_.store(out_file, map_)

    elif in_type == pms.Type.MZQ:
        file_ = pms.MzQuantMLFile()
        msq = pms.MSQuantifications()
        file_.load(in_file, msq)
        maps = msq.getConsensusMaps()
        for map_ in maps:
            mapper.annotate(map_, peptide_ids, protein_ids, use_subelements)
            addDataProcessing(
                map_, params,
                pms.DataProcessing.ProcessingAction.IDENTIFICATION_MAPPING)
        msq.setConsensusMaps(maps)
        file_.store(out_file, msq)

    else:
        raise Exception("invalid input file format")
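
# A minimal usage sketch (hypothetical file names; assumes the imports above
# and that the IDMapper defaults fit the data):
params = pms.IDMapper().getDefaults()
id_mapper("features.featureXML", "ids.idXML", "annotated.featureXML",
          params, use_centroid_rt=False, use_centroid_mz=False,
          use_subelements=False)
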
Example #2

def link(in_files, out_file, keep_subelements, params):

    in_types = set(pms.FileHandler.getType(in_) for in_ in in_files)

    if in_types == set((pms.Type.CONSENSUSXML, )):
        link_features = False
    elif in_types == set((pms.Type.FEATUREXML, )):
        link_features = True
    else:
        raise Exception("different kinds of input files")

    algorithm_parameters = params.copy("algorithm:", True)
    algorithm = pms.FeatureGroupingAlgorithmQT()
    algorithm.setParameters(algorithm_parameters)

    out_map = pms.ConsensusMap()
    fds = out_map.getColumnHeaders()
    if link_features:
        f = pms.FeatureXMLFile()
        maps = []
        for i, in_file in enumerate(in_files):
            map_ = pms.FeatureMap()
            f.load(in_file, map_)

            # set filedescriptions
            fd = fds.get(i, pms.ColumnHeader())
            fd.filename = in_file
            fd.size = map_.size()
            fd.unique_id = map_.getUniqueId()
            fds[i] = fd
            maps.append(map_)
        out_map.setColumnHeaders(fds)
        algorithm.group(maps, out_map)
    else:
        f = pms.ConsensusXMLFile()
        maps = []
        for i, in_file in enumerate(in_files):
            map_ = pms.ConsensusMap()
            f.load(in_file, map_)
            maps.append(map_)
        algorithm.group(maps, out_map)

        if not keep_subelements:
            for i in range(len(in_files)):
                # set filedescriptions
                fd = fds.get(i, pms.ColumnHeader())
                fd.filename = in_files[i]
                fd.size = maps[i].size()
                fd.unique_id = maps[i].getUniqueId()
                fds[i] = fd
            out_map.setColumnHeaders(fds)
        else:
            algorithm.transferSubelements(maps, out_map)

    out_map.setUniqueIds()
    addDataProcessing(out_map, params,
                      pms.DataProcessing.ProcessingAction.FEATURE_GROUPING)

    pms.ConsensusXMLFile().store(out_file, out_map)

    sizes = []
    for feat in out_map:
        sizes.append(feat.size())

    c = Counter(sizes)
    print("Number of consensus features:")
    for size, count in c.most_common():
        print("   of size %2d : %6d" % (size, count))
    print("        total : %6d" % out_map.size())
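
# A minimal usage sketch (hypothetical file names; assumes the imports above;
# the "algorithm:" prefix matches the params.copy("algorithm:", True) call):
params = pms.Param()
params.insert("algorithm:", pms.FeatureGroupingAlgorithmQT().getDefaults())
link(["run1.featureXML", "run2.featureXML"], "linked.consensusXML",
     keep_subelements=False, params=params)
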
Example #3
# Imports assumed by this example; default_store_aligned_feature_xml is a
# helper from the surrounding module (not shown here) that stores an aligned
# FeatureMap and returns the path it was written to.
import os
from pathlib import Path

import numpy as np
import pyopenms as oms

def align_feature_xmls(feature_xml_lis,
                       consensus_map_out_path="",
                       class_label_dict={}):
    """
    first apply pose clustering to include all features maps
      next link/group them across all features

    Each MS1 spectrum from raw-file will create a feature file -
    we need to load and align them to get unique and representative features
    :param feature_xml_lis:
    :param consensus_map_out_path:
    :return: consensus_map, consensus_map_out_path, measurement_names
    """
    # consensus map normalization and export happen at the end of this
    # function (median normalization via normalizeMaps, see below)

    # openms won't deal with posix paths - it wants strings instead,
    # so we need to make sure it gets those
    # sort them to make sure the feature matrix is also sorted
    feature_xml_lis = sorted([str(fx) for fx in feature_xml_lis])

    num_features_list = []
    for current_feature_xml_path in feature_xml_lis:
        # load features into FeatureMaps
        cm = oms.FeatureMap()  # current_map
        oms.FeatureXMLFile().load(current_feature_xml_path, cm)
        # list_functions(current_map, prefix="")
        num_features_list.append(cm.size())
        del cm

    # should choose the feature file / experiment with most features as reference
    max_index = np.argmax(num_features_list)
    reference_map_path = feature_xml_lis[max_index]

    default_max_num_peaks_considered = 1000
    default_max_scaling_value = 10.0
    aligned_paths = []
    for i, current_feature_xml_path in enumerate(feature_xml_lis):
        # load features into FeatureMaps
        # pairwise alignment - so we need a master (reference) map
        reference_map = oms.FeatureMap()
        oms.FeatureXMLFile().load(reference_map_path, reference_map)

        current_map = oms.FeatureMap()
        oms.FeatureXMLFile().load(current_feature_xml_path, current_map)

        # create a transformation description required as init for aligner
        transformation_description = oms.TransformationDescription()

        # adjust the max scaling parameter, otherwise alignment errors out on algae samples
        # adjust max_num_peaks_considered as well - it would likewise lead to errors on algae samples

        aligner = oms.MapAlignmentAlgorithmPoseClustering()
        aligner_params = aligner.getParameters()

        # print(aligner_params.asDict().keys())
        max_scaling_key = b'superimposer:max_scaling'
        # aligner_params.getEntry(max_scaling_key)
        aligner_params.setValue(max_scaling_key, default_max_scaling_value)

        max_num_peaks_key = b'max_num_peaks_considered'
        # aligner_params.getEntry(max_num_peaks_key)
        aligner_params.setValue(
            max_num_peaks_key,
            default_max_num_peaks_considered)  # default = 1000
        # need higher default for algae

        # decrease runtime by removing weak signals
        # print(aligner_params.asDict())
        num_used_points_key = b'superimposer:num_used_points'
        # aligner_params.getEntry(num_used_points_key)
        aligner_params.setValue(
            num_used_points_key,
            1000)  # half the default parameter, speed up alignment

        aligner.setParameters(aligner_params)

        aligner.setReference(reference_map)

        try:
            # run alignment
            aligner.align(current_map, transformation_description)
        except RuntimeError as re:
            if 'max_num_peaks_considered' in str(re):
                # retry with higher threshold - required for algae dataset
                default_max_num_peaks_considered = 15000  # 15 fold - makes it a lot slower but less error prone
                aligner_params.setValue(max_num_peaks_key,
                                        default_max_num_peaks_considered)
                default_max_scaling_value = 20.0  # need to increase to 20
                aligner_params.setValue(max_scaling_key,
                                        default_max_scaling_value)

                # max shift could also be off - issue for ckd dataset
                default_max_shift_value = 2000.0  # need to increase from 1000 to 2000
                max_shift_key = b'superimposer:max_shift'
                aligner_params.setValue(max_shift_key, default_max_shift_value)

                print(
                    f"Encountered GC/MS Clustering issue - setting 'max_num_peaks_considered' to {default_max_num_peaks_considered}, 'superimposer:max_scaling' to {default_max_scaling_value} and 'superimposer:max_shift' to {default_max_shift_value}"
                )
                aligner.setParameters(aligner_params)
                aligner.setReference(reference_map)
                aligner.align(current_map, transformation_description)
            else:
                # a different alignment error - re-raise instead of silently
                # continuing with an unaligned map
                raise

        current_map.updateRanges()
        reference_map.updateRanges()

        # update feature XML files - both reference and current
        updated_current_map_path = default_store_aligned_feature_xml(
            current_map, current_feature_xml_path)
        updated_reference_path = default_store_aligned_feature_xml(
            reference_map, reference_map_path)
        reference_map_path = updated_reference_path

        aligned_paths.append(updated_current_map_path)
        print(f"Finished alignment of {i}/{len(feature_xml_lis)-1}")

    # replace the reference entry with the updated reference map path
    aligned_paths[max_index] = reference_map_path

    #   link/group them across features to create consensus map

    grouper = oms.FeatureGroupingAlgorithmUnlabeled()
    # leave parameters default

    # according to openms documentation:
    #   b) Call "setReference", "addToGroup" (n times), "getResultMap" in that order.

    for i, current_feature_map_path in enumerate(aligned_paths):
        print(f"Grouping features {i}/{len(aligned_paths)-1}")
        current_map = oms.FeatureMap()
        oms.FeatureXMLFile().load(current_feature_map_path, current_map)

        if not i:
            # first iteration - use as reference
            grouper.setReference(i, current_map)

        else:
            grouper.addToGroup(i, current_map)

    # get consensus map
    consensus_map = grouper.getResultMap()

    # consensus map requires some mapping between ids and filenames - otherwise will complain
    print(f"Mapping aligned results back to class labels")
    class_label_fns = list(class_label_dict.keys())
    fds = {i: oms.ColumnHeader() for i, _ in enumerate(aligned_paths)}
    measurement_names = []
    for i, aligned_path in enumerate(aligned_paths):
        # fds[i].filename = b"file0"
        current_fn = Path(aligned_path).name  # filename including suffix

        # this is where we need to replace the feature_xml filenames with the ones from class_labels
        if class_label_dict:
            # could do longest substring match with each of the fns in class_label dict to find matching filename
            #   django will rename duplicate filenames instead of overwriting
            # or we expect both featureXML input and class_label_dict to be ordered - which they should be when using the getter
            fds[i].filename = class_label_fns[i]

        else:
            fds[i].filename = current_fn.encode(
                "UTF8")  # needs bytestring representation

        measurement_names.append(current_fn)

    consensus_map.setColumnHeaders(fds)

    #  cleanup aligned_feature_xmls - can be >30mb per file - so better remove them
    for ap in aligned_paths:
        os.remove(ap)

    #   do consensus map normalization and export to consensus files
    # using median normalization, also available are Quantile and "robust regression"
    normalizer = oms.ConsensusMapNormalizerAlgorithmMedian()

    # The Python signature of ConsensusMapNormalizerAlgorithmMedian.normalizeMaps
    # is incomplete (*args/**kwargs for the required parameters), and calling it
    # with the method as a string,
    #   normalizer.normalizeMaps(consensus_map, "NM_SCALE", "", "")
    # raises a TypeError asking for an int.
    """
    normalizer.normalizeMaps(map, method, acc_filter, desc_filter)
    map	ConsensusMap
    method	whether to use scaling or shifting to same median 
    acc_filter	string describing the regular expression for filtering accessions
    desc_filter	string describing the regular expression for filtering descriptions 
    """
    """
        method: probably 0 / 1 - referenced as Enumerator in OpenMS documentation
        from shell output can deduce normalization methods are
        0: NM_SCALE 	scale to same median using division/multiplication  
        1: NM_SHIFT 	shift using subtraction/addition
    """
    normalizer.normalizeMaps(consensus_map, 0, "", "")

    # export only if a path was given - otherwise skip the extra file management
    if consensus_map_out_path:
        print("Storing consensus xml")
        oms.ConsensusXMLFile().store(str(consensus_map_out_path),
                                     consensus_map)

    return consensus_map, measurement_names
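
# A minimal usage sketch (hypothetical paths; assumes the imports above and the
# default_store_aligned_feature_xml helper from the surrounding module):
feature_xmls = sorted(Path("feature_dir").glob("*.featureXML"))
consensus_map, names = align_feature_xmls(
    feature_xmls, consensus_map_out_path="consensus.consensusXML")
print(f"{consensus_map.size()} consensus features from {len(names)} measurements")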