Beispiel #1
0
def msfile2corpus(ms2_file, ms2_format, min_ms1_intensity, min_ms2_intensity,
                  mz_tol, rt_tol, feature_set_name, k, corpusjson):
    """Load one MS2 file, bin its fragments and dump an LDA corpus as JSON.

    Args:
        ms2_file: Spectra file to load; handed to the loader as a
            single-element list.
        ms2_format: Loader selector, one of 'mzxml', 'msp' or 'mgf'.
        min_ms1_intensity: Forwarded unchanged to the loader.
        min_ms2_intensity: Forwarded unchanged to the loader.
        mz_tol: Forwarded unchanged to the loader.
        rt_tol: Forwarded unchanged to the loader.
        feature_set_name: Selects the bin width; one of 'binned_005',
            'binned_01', 'binned_05', 'binned_1' or 'binned_5'.
        k: Number of topics, passed to VariationalLDA as ``K``.
        corpusjson: Writable file-like object that receives the JSON.

    Raises:
        NotImplementedError: If ``ms2_format`` is not a known format.
        KeyError: If ``feature_set_name`` is not a known feature set.
        IndexError: If the feature maker returns an empty corpus.
    """
    # Arguments shared by every loader; MSP/MGF additionally take name_field.
    loader_args = dict(min_ms1_intensity=min_ms1_intensity,
                       min_ms2_intensity=min_ms2_intensity,
                       mz_tol=mz_tol,
                       rt_tol=rt_tol,
                       peaklist=None)
    if ms2_format == 'mzxml':
        loader = LoadMZML(**loader_args)
    elif ms2_format == 'msp':
        loader = LoadMSP(name_field="", **loader_args)
    elif ms2_format == 'mgf':
        loader = LoadMGF(name_field="", **loader_args)
    else:
        raise NotImplementedError('Unknown ms2 format')
    # ms1 is returned by the loader but not needed for the corpus.
    ms1, ms2, metadata = loader.load_spectra([ms2_file])

    bin_widths = {
        'binned_005': 0.005,
        'binned_01': 0.01,
        'binned_05': 0.05,
        'binned_1': 0.1,
        'binned_5': 0.5,
    }
    bin_width = bin_widths[feature_set_name]

    fm = MakeBinnedFeatures(bin_width=bin_width)
    corpus, features = fm.make_features(ms2)
    # Keep only the first entry of the returned mapping. list(corpus)[0]
    # works on Python 2 and 3 (dict.keys() is not subscriptable on 3) and
    # still raises IndexError when the mapping is empty.
    first_key = list(corpus)[0]
    corpus = corpus[first_key]

    # To insert in db some additional data is generated in VariationalLDA
    vlda = VariationalLDA(corpus=corpus, K=k)
    lda_dict = {
        'corpus': corpus,
        'word_index': vlda.word_index,
        'doc_index': vlda.doc_index,
        'doc_metadata': metadata,
        'topic_index': vlda.topic_index,
        'topic_metadata': vlda.topic_metadata,
        'features': features,
    }
    json.dump(lda_dict, corpusjson)
Beispiel #2
0
        from motifdb_loader import load_db,MotifFilter
        
        db_list = options['motifdb'].split()
        db_spectra,db_metadata = load_db(db_list,db_path)
        db_spectra,db_metadata = MotifFilter(db_spectra,db_metadata).filter()
    else:
        db_spectra = None
        db_metadata = None




    print "Creating LDA object with K = {}".format(options['K'])
    v = VariationalLDA(corpus[corpus.keys()[0]],
        K = int(options['K']),
        normalise = 1000.0,
        fixed_topics = db_spectra,
        fixed_topics_metadata = db_metadata)

    print "Running LDA for {} iterations".format(options['n_its'])

    v.run_vb(initialise = True,n_its = int(options['n_its']))

    print "Writing dictionary"
    output_dict_file = input_prefix + '_lda.dict'
    vd = v.make_dictionary(metadata = metadata,
        features = word_mz_range,
        compute_overlaps = True,
        filename = output_dict_file)

    print "Augmenting edge file"
Beispiel #3
0
def run_lda(corpus,
            metadata,
            word_mz_range,
            K,
            experiment_id,
            bin_width=0.005,
            n_its=1000,
            include_motifset=None):
    """Fit a VariationalLDA model, optionally seeded with MotifDB motifs.

    Args:
        corpus: Corpus handed to VariationalLDA.
        metadata: Document metadata passed through to ``make_dictionary``.
        word_mz_range: Mapping of feature name to ``(mz_min, mz_max)``.
            Mutated in place: MotifDB features missing from it are added.
        K: Number of free topics requested from VariationalLDA.
        experiment_id: Primary key of the Experiment. It is looked up even
            when no motif sets are requested, and its id is embedded in
            the names of the imported motifs.
        bin_width: Width used by the FeatureMatcher and for the m/z range
            of newly added features.
        n_its: Number of iterations passed to ``run_vb``.
        include_motifset: String representation of a list of MDBMotifSet
            ids (parsed with ``ast.literal_eval``), or a falsy value to
            run without fixed topics.

    Returns:
        The dictionary produced by ``VariationalLDA.make_dictionary``.
    """
    experiment = Experiment.objects.get(id=experiment_id)
    from motifdb.models import MDBMotifSet, MDBMotif

    # Extra VariationalLDA keyword arguments; stays empty for a plain run.
    fixed_topic_kwargs = {}

    if include_motifset:
        import ast
        # convert the string rep of the list into an actual list
        motifset_ids = ast.literal_eval(include_motifset)
        motifdb_spectra = {}
        motifdb_metadata = {}
        for motifset_id in motifset_ids:
            mset = MDBMotifSet.objects.get(id=motifset_id)
            for m in MDBMotif.objects.filter(motif_set=mset):
                new_motif_name = "{}(Exp:{})".format(m.name, experiment.id)
                instances = Mass2MotifInstance.objects.filter(mass2motif=m)
                motifdb_spectra[new_motif_name] = {
                    inst.feature.name: inst.probability for inst in instances
                }
                md = jsonpickle.decode(m.metadata)
                motifdb_metadata[new_motif_name] = dict(md.items())

        # filter to remove duplicates
        print("Filtering motifs to remove duplicates")
        from motifdb.views import MotifFilter
        motif_filter = MotifFilter(motifdb_spectra, motifdb_metadata)
        motifdb_spectra, motifdb_metadata = motif_filter.filter()

        # Every feature name used by any of the remaining motifs.
        motifdb_features = set()
        for spec in motifdb_spectra.values():
            motifdb_features.update(spec)

        fm = FeatureMatcher(motifdb_features,
                            word_mz_range,
                            bin_width=bin_width)
        motifdb_spectra = fm.convert(motifdb_spectra)

        # Add the motifdb features to avoid problems when loading the dict
        # into vlda later. Feature names are parsed as '<prefix>_<mz>'.
        # Dividing by 2.0 keeps this a true division even when bin_width is
        # an integer under Python 2.
        half_width = bin_width / 2.0
        added = 0
        for f in motifdb_features:
            if f not in word_mz_range:
                word_mz = float(f.split('_')[1])
                word_mz_range[f] = (word_mz - half_width,
                                    word_mz + half_width)
                added += 1

        print("Added {} features".format(added))

        fixed_topic_kwargs = {
            'fixed_topics': motifdb_spectra,
            'fixed_topics_metadata': motifdb_metadata,
        }
    else:
        print("Standard with no added motifs")

    vlda = VariationalLDA(corpus=corpus,
                          K=K,
                          normalise=1000.0,
                          **fixed_topic_kwargs)
    vlda.run_vb(n_its=n_its, initialise=True)
    return vlda.make_dictionary(metadata=metadata, features=word_mz_range)
Beispiel #4
0
def run_lda(corpus, metadata, word_mz_range, K, n_its=1000):
    """Fit a VariationalLDA model on ``corpus`` and return its dictionary.

    The model is built with ``K`` topics and ``normalise=1000.0``, run for
    ``n_its`` iterations with initialisation enabled, and then exported via
    ``make_dictionary`` using ``metadata`` and ``word_mz_range`` (passed as
    the ``features`` argument).
    """
    model = VariationalLDA(corpus=corpus, K=K, normalise=1000.0)
    model.run_vb(n_its=n_its, initialise=True)
    return model.make_dictionary(metadata=metadata, features=word_mz_range)
        word_mz = float(f.split('_')[1])
        word_mz_min = word_mz - bin_width / 2
        word_mz_max = word_mz + bin_width / 2
        features[f] = (word_mz_min, word_mz_max)
        added += 1

# Report the value of the `added` counter maintained by the preceding loop.
print "Added {} features".format(added)

from lda import VariationalLDA

#K = 300  # number of *new* topics
K = input_free_motifs  # number of *new* topics

# Build the model with the motifdb spectra/metadata supplied as fixed topics.
vlda = VariationalLDA(corpus,
                      K=K,
                      normalise=1000.0,
                      fixed_topics=motifdb_spectra,
                      fixed_topics_metadata=motifdb_metadata)

# note that for real runs the number of iterations is recommended to be 1000 or higher
vlda.run_vb(initialise=True, n_its=input_iterations)

# Export the fitted model as a dictionary; a filename derived from
# output_prefix is passed along (presumably the dictionary is also written
# there -- confirm in lda.py).
vd = vlda.make_dictionary(features=features,
                          metadata=metadata,
                          filename=output_prefix + '.dict')
from ms2lda_molnet_integration import write_output_files
write_output_files(vd,
                   pairs_file,
                   output_prefix,
                   metadata,
    if not f in features:
        word_mz = float(f.split('_')[1])
        word_mz_min = word_mz - bin_width / 2
        word_mz_max = word_mz + bin_width / 2
        features[f] = (word_mz_min, word_mz_max)
        added += 1

# Report the value of the `added` counter maintained by the preceding loop.
print "Added {} features".format(added)

from lda import VariationalLDA

#K = 300  # number of *new* topics
K = input_free_motifs  # number of *new* topics

# Build the model with the motifdb spectra/metadata supplied as fixed topics.
vlda = VariationalLDA(corpus, K=K, normalise=1000.0,
                      fixed_topics=motifdb_spectra,
                      fixed_topics_metadata=motifdb_metadata)

# note that for real runs the number of iterations is recommended to be 1000 or higher
vlda.run_vb(initialise=True, n_its=input_iterations)

# Export the fitted model as a dictionary; a filename derived from
# output_prefix is passed along (presumably the dictionary is also written
# there -- confirm in lda.py).
vd = vlda.make_dictionary(
    features=features, metadata=metadata, filename=output_prefix + '.dict')

# Hand the dictionary, the pairs file and the network thresholds taken from
# the parsed command-line `args` to the molecular-networking output writer.
from ms2lda_molnet_integration import write_output_files
write_output_files(vd, pairs_file, output_prefix, metadata,
                   overlap_thresh=args.input_network_overlap, p_thresh=args.input_network_pvalue,
                   X=args.input_network_topx, motif_metadata = motifdb_metadata)

# Writing the report - note that you might need to set the 'backend' argument
# for this method to work (see the method in lda.py) as it depends what on
Beispiel #7
0
def run_lda(corpus, metadata, word_mz_range, K, experiment_id,
            bin_width=0.005, n_its=1000, include_motifset=None):
    """Run VariationalLDA on ``corpus``, optionally with MotifDB fixed topics.

    Args:
        corpus: Corpus handed to VariationalLDA.
        metadata: Document metadata passed through to ``make_dictionary``.
        word_mz_range: Mapping of feature name to ``(mz_min, mz_max)``.
            Mutated in place when motif sets are included: MotifDB
            features missing from it are added.
        K: Number of free topics requested from VariationalLDA.
        experiment_id: Primary key of the Experiment; always looked up, and
            its id is embedded in the names of imported motifs.
        bin_width: Width used by the FeatureMatcher and for the m/z range
            of newly added features.
        n_its: Number of iterations passed to ``run_vb``.
        include_motifset: String representation of a list of MDBMotifSet
            ids (parsed with ``ast.literal_eval``); a falsy value selects
            a plain run without fixed topics.

    Returns:
        The dictionary produced by ``VariationalLDA.make_dictionary``.
    """
    experiment = Experiment.objects.get(id=experiment_id)
    from motifdb.models import MDBMotifSet, MDBMotif

    # Plain run: nothing to import from MotifDB, fit and return right away.
    if not include_motifset:
        print("Standard with no added motifs")
        vlda = VariationalLDA(corpus=corpus, K=K, normalise=1000.0)
        vlda.run_vb(n_its=n_its, initialise=True)
        return vlda.make_dictionary(metadata=metadata,
                                    features=word_mz_range)

    import ast
    # convert the string rep of the list into an actual list
    motifset_ids = ast.literal_eval(include_motifset)
    motifdb_spectra = {}
    motifdb_metadata = {}
    for motifset_id in motifset_ids:
        mset = MDBMotifSet.objects.get(id=motifset_id)
        temp_motifs = MDBMotif.objects.filter(motif_set=mset)
        for m in temp_motifs:
            new_motif_name = "{}(Exp:{})".format(m.name, experiment.id)
            fi = Mass2MotifInstance.objects.filter(mass2motif=m)
            spectrum = {}
            for f in fi:
                spectrum[f.feature.name] = f.probability
            motifdb_spectra[new_motif_name] = spectrum
            md = jsonpickle.decode(m.metadata)
            motifdb_metadata[new_motif_name] = dict(md.items())

    # filter to remove duplicates
    print("Filtering motifs to remove duplicates")
    from motifdb.views import MotifFilter
    motif_filter = MotifFilter(motifdb_spectra, motifdb_metadata)
    motifdb_spectra, motifdb_metadata = motif_filter.filter()

    # Union of the feature names used by the remaining motifs.
    motifdb_features = set()
    for spec in motifdb_spectra.values():
        motifdb_features.update(spec)

    fm = FeatureMatcher(motifdb_features, word_mz_range, bin_width=bin_width)
    motifdb_spectra = fm.convert(motifdb_spectra)

    # Add the motifdb features to avoid problems when loading the dict into
    # vlda later. Feature names are parsed as '<prefix>_<mz>'.
    added = 0
    for f in motifdb_features:
        if f in word_mz_range:
            continue
        word_mz = float(f.split('_')[1])
        # 2.0 forces true division even for an integer bin_width on Python 2.
        word_mz_min = word_mz - bin_width / 2.0
        word_mz_max = word_mz + bin_width / 2.0
        word_mz_range[f] = (word_mz_min, word_mz_max)
        added += 1

    print("Added {} features".format(added))

    vlda = VariationalLDA(corpus, K=K, normalise=1000.0,
                          fixed_topics=motifdb_spectra,
                          fixed_topics_metadata=motifdb_metadata)
    vlda.run_vb(n_its=n_its, initialise=True)
    return vlda.make_dictionary(metadata=metadata, features=word_mz_range)