Beispiel #1
0
    print "Creating LDA object with K = {}".format(options['K'])
    v = VariationalLDA(corpus[corpus.keys()[0]],
        K = int(options['K']),
        normalise = 1000.0,
        fixed_topics = db_spectra,
        fixed_topics_metadata = db_metadata)

    print "Running LDA for {} iterations".format(options['n_its'])

    v.run_vb(initialise = True,n_its = int(options['n_its']))

    print "Writing dictionary"
    output_dict_file = input_prefix + '_lda.dict'
    vd = v.make_dictionary(metadata = metadata,
        features = word_mz_range,
        compute_overlaps = True,
        filename = output_dict_file)

    print "Augmenting edge file"

    edge_file = input_prefix + '_edges.csv'
    original_edges = []
    with open(edge_file,'r') as f:
        reader = csv.reader(f)
        for edge in reader:
            original_edges.append(edge[:-1] + ['cosine'] + edge[-1:])

    print "Computing MS2LDA edges with p_thresh = {}, and overlap_thresh = {}".format(
        options['p_thresh'],options['overlap_thresh'])

    motifs = vd['beta'].keys()
Beispiel #2
0
def run_lda(corpus,
            metadata,
            word_mz_range,
            K,
            experiment_id,
            bin_width=0.005,
            n_its=1000,
            include_motifset=None):
    """Run variational LDA, optionally seeded with fixed motifs from MotifDB.

    When ``include_motifset`` is truthy it is parsed as the string form of a
    list of ``MDBMotifSet`` ids. Every motif in those sets is loaded, renamed
    to ``"<motif name>(Exp:<experiment id>)"``, passed through ``MotifFilter``
    to drop duplicates, converted with ``FeatureMatcher`` and handed to
    ``VariationalLDA`` as fixed topics. Otherwise a standard run without
    fixed topics is performed.

    Args:
        corpus: Corpus passed to ``VariationalLDA``.
        metadata: Passed through to ``make_dictionary``.
        word_mz_range: Mapping of feature name to ``(mz_min, mz_max)``.
            NOTE: mutated in place -- MotifDB features that are missing from
            it are added so the resulting dictionary can be loaded later.
        K: Value passed to ``VariationalLDA`` as ``K``.
        experiment_id: Primary key of the ``Experiment``; looked up even for
            a standard run, so an unknown id raises in both modes.
        bin_width: Width of the m/z window created for each added feature.
        n_its: Number of iterations passed to ``run_vb``.
        include_motifset: String representation of a list of ``MDBMotifSet``
            ids, or a falsy value for a standard run.

    Returns:
        The value returned by ``VariationalLDA.make_dictionary``.

    Raises:
        IndexError, ValueError: If a MotifDB feature that has to be added is
            not named ``<prefix>_<mz>`` with a numeric ``<mz>`` part.
    """
    experiment = Experiment.objects.get(id=experiment_id)
    from motifdb.models import MDBMotifSet, MDBMotif

    # Extra constructor arguments; stays empty for a standard run so that
    # VariationalLDA is called exactly as before in that case.
    fixed_topic_kwargs = {}

    if include_motifset:
        import ast
        # convert the string rep of the list into an actual list
        motifset_ids = ast.literal_eval(include_motifset)
        motifdb_spectra = {}
        motifdb_metadata = {}
        for motifset_id in motifset_ids:
            mset = MDBMotifSet.objects.get(id=motifset_id)
            for m in MDBMotif.objects.filter(motif_set=mset):
                # Tag the name with the experiment id
                new_motif_name = "{}(Exp:{})".format(m.name, experiment.id)
                instances = Mass2MotifInstance.objects.filter(mass2motif=m)
                motifdb_spectra[new_motif_name] = dict(
                    (inst.feature.name, inst.probability)
                    for inst in instances)
                motifdb_metadata[new_motif_name] = dict(
                    jsonpickle.decode(m.metadata).items())

        # filter to remove duplicates
        print("Filtering motifs to remove duplicates")
        from motifdb.views import MotifFilter
        motif_filter = MotifFilter(motifdb_spectra, motifdb_metadata)
        motifdb_spectra, motifdb_metadata = motif_filter.filter()

        # Feature names are collected *before* conversion, as in the
        # original implementation.
        motifdb_features = set(
            f for spec in motifdb_spectra.values() for f in spec)

        fm = FeatureMatcher(motifdb_features,
                            word_mz_range,
                            bin_width=bin_width)
        motifdb_spectra = fm.convert(motifdb_spectra)

        # Add the motifdb features to avoid problems when loading the dict
        # into vlda later.
        # Divide by 2.0: under Python 2 an integer bin_width would otherwise
        # be floor-divided and the window would collapse or be lopsided.
        half_width = bin_width / 2.0
        added = 0
        for f in motifdb_features:
            if f not in word_mz_range:
                word_mz = float(f.split('_')[1])
                word_mz_range[f] = (word_mz - half_width,
                                    word_mz + half_width)
                added += 1

        print("Added {} features".format(added))

        fixed_topic_kwargs = {
            'fixed_topics': motifdb_spectra,
            'fixed_topics_metadata': motifdb_metadata,
        }
    else:
        print("Standard with no added motifs")

    vlda = VariationalLDA(corpus=corpus,
                          K=K,
                          normalise=1000.0,
                          **fixed_topic_kwargs)
    vlda.run_vb(n_its=n_its, initialise=True)

    return vlda.make_dictionary(metadata=metadata, features=word_mz_range)
Beispiel #3
0
def run_lda(corpus, metadata, word_mz_range, K, n_its=1000):
    """Fit a ``VariationalLDA`` model on ``corpus`` and export it.

    The model is built with ``K`` and ``normalise=1000.0``, run with
    ``run_vb`` for ``n_its`` iterations (with ``initialise=True``), and the
    result of ``make_dictionary(metadata=metadata, features=word_mz_range)``
    is returned.
    """
    model = VariationalLDA(normalise=1000.0, K=K, corpus=corpus)
    model.run_vb(initialise=True, n_its=n_its)
    return model.make_dictionary(features=word_mz_range, metadata=metadata)
# Script fragment: run LDA with fixed MotifDB topics, save the resulting
# dictionary and write the network-integration output files.
# NOTE(review): corpus, features, metadata, motifdb_spectra, motifdb_metadata,
# input_free_motifs, input_iterations, output_prefix, pairs_file and args are
# all defined outside this fragment.
from lda import VariationalLDA

#K = 300  # number of *new* topics
K = input_free_motifs  # number of *new* topics

# The MotifDB spectra and their metadata are supplied as fixed topics
# alongside the K new ones.
vlda = VariationalLDA(corpus,
                      K=K,
                      normalise=1000.0,
                      fixed_topics=motifdb_spectra,
                      fixed_topics_metadata=motifdb_metadata)

# note that for real runs the number of iterations is recommended to be 1000 or higher
vlda.run_vb(initialise=True, n_its=input_iterations)

# presumably `filename` makes make_dictionary also save the dictionary to
# '<output_prefix>.dict' -- confirm in lda.py
vd = vlda.make_dictionary(features=features,
                          metadata=metadata,
                          filename=output_prefix + '.dict')

# Thresholds and the top-X setting come from the parsed command-line `args`.
from ms2lda_molnet_integration import write_output_files
write_output_files(vd,
                   pairs_file,
                   output_prefix,
                   metadata,
                   overlap_thresh=args.input_network_overlap,
                   p_thresh=args.input_network_pvalue,
                   X=args.input_network_topx,
                   motif_metadata=motifdb_metadata)

# Writing the report - note that you might need to set the 'backend' argument
# for this method to work (see the method in lda.py), as it depends on what
# your system will use to render the PDF...
# Script fragment: report how many features were added, run LDA with fixed
# MotifDB topics, save the dictionary, write the network-integration output
# files and finally try to render the PDF topic report.
# NOTE(review): added, corpus, features, metadata, motifdb_spectra,
# motifdb_metadata, input_free_motifs, input_iterations, output_prefix,
# pairs_file and args are all defined outside this fragment.
print("Added {} features".format(added))

from lda import VariationalLDA

# K = 300  # number of *new* topics
K = input_free_motifs  # number of *new* topics

vlda = VariationalLDA(corpus, K=K, normalise=1000.0,
                      fixed_topics=motifdb_spectra,
                      fixed_topics_metadata=motifdb_metadata)

# note that for real runs the number of iterations is recommended to be 1000 or higher
vlda.run_vb(initialise=True, n_its=input_iterations)

vd = vlda.make_dictionary(
    features=features, metadata=metadata, filename=output_prefix + '.dict')

from ms2lda_molnet_integration import write_output_files
write_output_files(vd, pairs_file, output_prefix, metadata,
                   overlap_thresh=args.input_network_overlap, p_thresh=args.input_network_pvalue,
                   X=args.input_network_topx, motif_metadata=motifdb_metadata)

# Writing the report - note that you might need to set the 'backend' argument
# for this method to work (see the method in lda.py), as it depends on what
# your system will use to render the PDF...
from lda import write_topic_report
try:
    write_topic_report(vd, output_prefix + '_topic_report.pdf')
except Exception as exc:
    # The report is best-effort: a rendering failure must not abort the run.
    # Catch Exception rather than using a bare except so Ctrl-C and
    # SystemExit still propagate, and show the cause instead of hiding it.
    print("PDF Generation Failed: {!r}".format(exc))
Beispiel #6
0
def run_lda(corpus, metadata, word_mz_range, K, experiment_id,
            bin_width=0.005, n_its=1000, include_motifset=None):
    """Run variational LDA for an experiment, with optional MotifDB motifs.

    If ``include_motifset`` is truthy it is evaluated with
    ``ast.literal_eval`` into a list of ``MDBMotifSet`` ids. The motifs of
    those sets are collected under the name
    ``"<motif name>(Exp:<experiment id>)"``, de-duplicated via
    ``MotifFilter``, converted via ``FeatureMatcher`` and given to
    ``VariationalLDA`` as fixed topics. If it is falsy, a standard model
    without fixed topics is built.

    Args:
        corpus: Corpus passed to ``VariationalLDA``.
        metadata: Forwarded to ``make_dictionary``.
        word_mz_range: Mapping of feature name to ``(mz_min, mz_max)``.
            NOTE: updated in place with any MotifDB feature it lacks.
        K: Value passed to ``VariationalLDA`` as ``K``.
        experiment_id: Primary key of the ``Experiment`` (always looked up).
        bin_width: Width of the m/z window created for each added feature.
        n_its: Number of iterations passed to ``run_vb``.
        include_motifset: String representation of a list of motif set ids,
            or a falsy value.

    Returns:
        The value returned by ``VariationalLDA.make_dictionary``.

    Raises:
        IndexError, ValueError: If a MotifDB feature that has to be added is
            not named ``<prefix>_<mz>`` with a numeric ``<mz>`` part.
    """
    experiment = Experiment.objects.get(id=experiment_id)
    from motifdb.models import MDBMotifSet, MDBMotif

    if include_motifset:
        import ast
        # convert the string rep of the list into an actual list
        selected_set_ids = ast.literal_eval(include_motifset)
        motifdb_spectra = {}
        motifdb_metadata = {}
        for set_id in selected_set_ids:
            mset = MDBMotifSet.objects.get(id=set_id)
            for motif in MDBMotif.objects.filter(motif_set=mset):
                motif_key = "{}(Exp:{})".format(motif.name, experiment.id)

                spectrum = {}
                for instance in Mass2MotifInstance.objects.filter(mass2motif=motif):
                    spectrum[instance.feature.name] = instance.probability
                motifdb_spectra[motif_key] = spectrum

                decoded = jsonpickle.decode(motif.metadata)
                motifdb_metadata[motif_key] = dict(decoded.items())

        # filter to remove duplicates
        print("Filtering motifs to remove duplicates")
        from motifdb.views import MotifFilter
        deduplicator = MotifFilter(motifdb_spectra, motifdb_metadata)
        motifdb_spectra, motifdb_metadata = deduplicator.filter()

        # Collect feature names before the spectra are converted, exactly
        # as the original implementation did.
        motifdb_features = set()
        for spec in motifdb_spectra.values():
            motifdb_features.update(spec)

        fm = FeatureMatcher(motifdb_features, word_mz_range, bin_width=bin_width)
        motifdb_spectra = fm.convert(motifdb_spectra)

        # Add the motifdb features to avoid problems when loading the dict
        # into vlda later.
        added = 0
        for feature_name in motifdb_features:
            if feature_name in word_mz_range:
                continue
            centre_mz = float(feature_name.split('_')[1])
            # 2.0 forces true division: with an integer bin_width Python 2
            # would floor-divide and produce a wrong m/z window.
            word_mz_range[feature_name] = (centre_mz - bin_width / 2.0,
                                           centre_mz + bin_width / 2.0)
            added += 1

        print("Added {} features".format(added))

        vlda = VariationalLDA(corpus, K=K, normalise=1000.0,
                              fixed_topics=motifdb_spectra,
                              fixed_topics_metadata=motifdb_metadata)
    else:
        print("Standard with no added motifs")
        vlda = VariationalLDA(corpus=corpus, K=K, normalise=1000.0)

    # Fitting and export are the same for both kinds of model.
    vlda.run_vb(n_its=n_its, initialise=True)
    return vlda.make_dictionary(metadata=metadata, features=word_mz_range)