Ejemplo n.º 1
0
    def load(self):
        """Load the recommender, the index statistics and the index from disk.

        Each of the three steps is best effort: if the corresponding file in
        ``RECOMMENDATION_DATA_DIR`` cannot be loaded, the attribute falls back
        to an empty default instead of raising.

        Sets ``self.cbtr``, ``self.index_stats`` and ``self.index``.
        """
        # Step 1: recommender built from the stored database and class names.
        try:
            tag_recommendation_data = loadFromJson(RECOMMENDATION_DATA_DIR + 'Current_database_and_class_names.json')
            DATABASE = tag_recommendation_data['database']
            CLASSES = tag_recommendation_data['classes']
            self.cbtr = CommunityBasedTagRecommender(dataset=DATABASE, classes=CLASSES)
            self.cbtr.load_recommenders()
        except Exception:
            # `except Exception` rather than a bare `except` so that
            # KeyboardInterrupt / SystemExit are not swallowed here.
            self.cbtr = None
            logger.info("No computed matrices were found, recommendation system not loading for the moment (but service listening for data to come).")

        # Step 2: statistics about the computed matrices.
        try:
            self.index_stats = loadFromJson(RECOMMENDATION_DATA_DIR + 'Current_index_stats.json')
            logger.info("Matrices computed out of information from %i sounds" % self.index_stats['n_sounds_in_matrix'])
        except Exception as e:
            # Report through the logger like the rest of this method.
            logger.info("Index stats could not be loaded (%s), assuming 0 sounds in matrix." % e)
            self.index_stats = {
                'n_sounds_in_matrix': 0,
            }

        # Step 3: the index itself; its keys are converted with int() to find
        # the biggest id, so any non-numeric key also triggers the fallback.
        try:
            self.index = loadFromJson(RECOMMENDATION_DATA_DIR + 'Index.json')
            self.index_stats['biggest_id_in_index'] = max([int(key) for key in self.index.keys()])
            self.index_stats['n_sounds_in_index'] = len(self.index.keys())
        except Exception:
            # Also reached for an empty index, because max() of an empty list raises.
            logger.info("Index file not present. Listening for indexing data from appservers.")
            self.index_stats['biggest_id_in_index'] = 0
            self.index_stats['n_sounds_in_index'] = 0
            self.index = dict()
Ejemplo n.º 2
0
    def load(self):
        """Load the recommender and the index statistics from disk.

        Both steps are best effort: when the corresponding file in
        ``RECOMMENDATION_DATA_DIR`` cannot be loaded, ``self.cbtr`` becomes
        ``None`` and ``self.index_stats`` reports zero sounds.
        """
        try:
            tag_recommendation_data = loadFromJson(
                RECOMMENDATION_DATA_DIR +
                'Current_database_and_class_names.json')
            DATABASE = tag_recommendation_data['database']
            CLASSES = tag_recommendation_data['classes']
            self.cbtr = CommunityBasedTagRecommender(dataset=DATABASE,
                                                     classes=CLASSES)
            self.cbtr.load_recommenders()
        except Exception:
            # Not a bare `except`: KeyboardInterrupt / SystemExit must still
            # propagate.
            self.cbtr = None
            logger.info(
                "No computed matrices were found, recommendation system not loading for the moment (but service listening for data to come)."
            )

        try:
            self.index_stats = loadFromJson(RECOMMENDATION_DATA_DIR +
                                            'Current_index_stats.json')
            logger.info("Matrices computed out of information from %i sounds" %
                        self.index_stats['n_sounds_in_matrix'])
        except Exception as e:
            # `except ... as e` is valid on Python 2.6+ and Python 3, unlike
            # the old `except Exception, e` form. The error is reported
            # through the logger like the rest of this method.
            logger.info(
                "Index stats could not be loaded (%s), assuming 0 sounds in matrix."
                % e)
            self.index_stats = {
                'n_sounds_in_matrix': 0,
            }
Ejemplo n.º 3
0
    def process_tag_recommendation_data(self,
                                        resources_limit=None,
                                        tag_threshold=10,
                                        line_limit=99999999999999,
                                        recompute_all_classes=False,
                                        similarity_metric="cosine"):
        """Build the association matrix and load the data needed to classify resources.

        Args:
            resources_limit: not used in the visible part of this method.
            tag_threshold: forwarded to ``tas_to_association_matrix``.
            line_limit: forwarded to ``tas_to_association_matrix``.
            recompute_all_classes: not used in the visible part of this method.
            similarity_metric: not used in the visible part of this method.
        """
        # Process tas file and turn into association matrix and derived files
        database_name = self.tas_to_association_matrix(
            tag_threshold=tag_threshold, line_limit=line_limit)

        # Single-argument print(...) calls behave the same on Python 2 and 3.
        print("Loading community detector...")
        cd = CommunityDetector(verbose=False,
                               PATH=RECOMMENDATION_DATA_DIR + "Classifier")
        print(cd)

        # Classify existing resources
        resources_tags = loadFromJson(RECOMMENDATION_TMP_DATA_DIR +
                                      database_name + '_RESOURCES_TAGS.json')
        instances_ids = list(resources_tags.keys())
        try:
            resource_class = loadFromJson(
                RECOMMENDATION_DATA_DIR +
                'Classifier_classified_resources.json')
        except Exception:
            # No usable previous classification: start from scratch.
            resource_class = dict()
        # NOTE(review): this excerpt ends here; `instances_ids` and
        # `resource_class` are not consumed in the visible code.
Ejemplo n.º 4
0
    def load(self):
        """Load the recommender, the index statistics and the index from disk.

        All files are read from ``tr_settings.RECOMMENDATION_DATA_DIR``. Each
        step is best effort: a failure leaves the matching attribute at an
        empty default (``self.cbtr = None``, zeroed ``self.index_stats``
        entries, empty ``self.index``) instead of raising.
        """
        data_dir = tr_settings.RECOMMENDATION_DATA_DIR

        try:
            tag_recommendation_data = loadFromJson(data_dir + 'Current_database_and_class_names.json')
            DATABASE = tag_recommendation_data['database']
            CLASSES = tag_recommendation_data['classes']
            self.cbtr = CommunityBasedTagRecommender(dataset=DATABASE, classes=CLASSES)
            self.cbtr.load_recommenders()
        except Exception:
            # Narrowed from a bare `except` so KeyboardInterrupt / SystemExit
            # are no longer swallowed.
            self.cbtr = None
            logger.info("No computed matrices were found, recommendation system not loading for the moment (but service listening for data to come).")

        try:
            self.index_stats = loadFromJson(data_dir + 'Current_index_stats.json')
            logger.info("Matrices computed out of information from %i sounds" % self.index_stats['n_sounds_in_matrix'])
        except Exception as e:
            # Report through the logger like the rest of this method.
            logger.info("Index stats could not be loaded (%s), assuming 0 sounds in matrix." % e)
            self.index_stats = {
                'n_sounds_in_matrix': 0,
            }

        try:
            self.index = loadFromJson(data_dir + 'Index.json')
            self.index_stats['biggest_id_in_index'] = max([int(key) for key in self.index.keys()])
            self.index_stats['n_sounds_in_index'] = len(self.index.keys())
        except Exception:
            # Also reached when the index is empty (max() of an empty list
            # raises) or when a key cannot be converted with int().
            logger.info("Index file not present. Listening for indexing data from appservers.")
            self.index_stats['biggest_id_in_index'] = 0
            self.index_stats['n_sounds_in_index'] = 0
            self.index = dict()
Ejemplo n.º 5
0
    def __init__(self,
                 verbose=True,
                 classifier_type="svm",
                 PATH=None,
                 INIT_METHOD="ZeroInit",
                 selected_instances=None
                 ):
        """Load a stored classifier together with its metadata and tag names.

        Args:
            verbose: stored as ``self.verbose``.
            classifier_type: initial value of ``self.clf_type``; it is replaced
                by the ``clf_type`` entry of the metadata file once loaded.
            PATH: path prefix of the classifier files (``PATH + ".pkl"`` and
                ``PATH + "_meta.json"``). Required, despite the ``None`` default.
            INIT_METHOD: stored as ``self.init_method``.
            selected_instances: stored as ``self.selected_instances``.

        Raises:
            TypeError: if ``PATH`` is ``None``.
            Exception: if the classifier, its metadata or the tag names file
                does not exist.
        """
        self.verbose = verbose
        self.n_training_instances = 0
        self.clf_type = classifier_type
        self.class_name_ids = dict()
        self.init_method = INIT_METHOD
        self.selected_instances = selected_instances

        if PATH is None:
            # Previously this surfaced as an opaque "NoneType + str" TypeError
            # from the concatenations below; same exception type, clear message.
            raise TypeError("PATH is required: pass the path prefix of the stored classifier.")

        tag_names_file = RECOMMENDATION_DATA_DIR + 'Classifier_TAG_NAMES.npy'
        required_files = (PATH + ".pkl", PATH + "_meta.json", tag_names_file)
        if not all(os.path.exists(required) for required in required_files):
            raise Exception("Classifier not existing in classifiers folder.")

        self.clf = joblib.load(PATH + ".pkl")
        meta = loadFromJson(PATH + "_meta.json")
        # Values stored alongside the classifier override the defaults set above.
        self.clf_type = meta['clf_type']
        self.class_name_ids = meta['class_name_ids']
        self.n_training_instances = meta['n_training_instances']
        self.tag_names = load(tag_names_file)
    def load(self):
        """Load the recommender and the index statistics from disk.

        Both steps are best effort: when the corresponding file in
        ``RECOMMENDATION_DATA_DIR`` cannot be loaded, ``self.cbtr`` becomes
        ``None`` and ``self.index_stats`` reports zero sounds.
        """
        try:
            tag_recommendation_data = loadFromJson(RECOMMENDATION_DATA_DIR + 'Current_database_and_class_names.json')
            DATABASE = tag_recommendation_data['database']
            CLASSES = tag_recommendation_data['classes']
            self.cbtr = CommunityBasedTagRecommender(dataset=DATABASE, classes=CLASSES)
            self.cbtr.load_recommenders()
        except Exception:
            # Narrowed from a bare `except` so KeyboardInterrupt / SystemExit
            # still propagate.
            self.cbtr = None
            logger.info("No computed matrices were found, recommendation system not loading for the moment (but service listening for data to come).")

        try:
            self.index_stats = loadFromJson(RECOMMENDATION_DATA_DIR + 'Current_index_stats.json')
            logger.info("Matrices computed out of information from %i sounds" % self.index_stats['n_sounds_in_matrix'])
        except Exception as e:
            # `as e` replaces the Python-2-only `except Exception, e`; the
            # error is reported through the logger like the rest of the method.
            logger.info("Index stats could not be loaded (%s), assuming 0 sounds in matrix." % e)
            self.index_stats = {
                'n_sounds_in_matrix': 0,
            }
Ejemplo n.º 7
0
    def process_tag_recommendation_data(self,
                                        resources_limit=None,
                                        tag_threshold=10,
                                        line_limit=99999999999999,
                                        recompute_all_classes=False,
                                        similarity_metric="cosine"):
        """Build the association matrix and load the data needed to classify resources.

        Args:
            resources_limit: not used in the visible part of this method.
            tag_threshold: forwarded to ``tas_to_association_matrix``.
            line_limit: forwarded to ``tas_to_association_matrix``.
            recompute_all_classes: not used in the visible part of this method.
            similarity_metric: not used in the visible part of this method.
        """
        # Process tas file and turn into association matrix and derived files
        database_name = self.tas_to_association_matrix(tag_threshold=tag_threshold, line_limit=line_limit)

        # print(...) with a single argument prints the same on Python 2 and 3.
        print("Loading community detector...")
        cd = CommunityDetector(verbose=False, PATH=RECOMMENDATION_DATA_DIR + "Classifier")
        print(cd)

        # Classify existing resources
        resources_tags = loadFromJson(RECOMMENDATION_TMP_DATA_DIR + database_name + '_RESOURCES_TAGS.json')
        instances_ids = list(resources_tags.keys())
        try:
            resource_class = loadFromJson(RECOMMENDATION_DATA_DIR + 'Classifier_classified_resources.json')
        except Exception:
            # No usable previous classification: start with an empty mapping.
            resource_class = dict()
        # NOTE(review): this excerpt ends here; `instances_ids` and
        # `resource_class` are not consumed in the visible code.
def cutTermini(config, setting):
    """Run the termini-cutting step described by one configuration section.

    The residues listed under ``looseTerminiFront`` and ``looseTerminiBack``
    in the cut log are handed, together with the lines of the input PDB file,
    to ``utils.cutTerminiAndWriteToPdb``, which receives the output path.

    Args:
        config: object providing ``getSetting``, ``getInputFile`` and
            ``getOutputFile`` for a setting name.
        setting: name of the configuration section to use.

    Nothing is read or written when the section's ``dryRun`` flag is set.
    """
    # Look the section up once and reuse it for both flags.
    cutSetting = config.getSetting(setting)
    inputPdb = config.getInputFile(setting, "pdb")
    cutlog = config.getInputFile(setting, "cutlog")
    cutPdb = config.getOutputFile(setting, "out")

    if cutSetting['verbose']:
        print("Cut Termini from  " + inputPdb + " and output to " + cutPdb)
    if cutSetting["dryRun"]:
        return

    log = utils.loadFromJson(cutlog)
    residues = log['looseTerminiFront'] + log['looseTerminiBack']
    pdblines = utils.readFileToList(inputPdb)
    utils.cutTerminiAndWriteToPdb(residues, pdblines, cutPdb)
Ejemplo n.º 9
0
    def tas_to_association_matrix(self,
                                  tag_threshold=0,
                                  line_limit=1000000000):
        """Build the resource x tag association matrix from ``Index.json`` and save it.

        The index is iterated as ``{resource id: [tags]}``. Tags occurring
        fewer than ``tag_threshold`` times are dropped, resources left without
        tags are skipped, and the matrix plus its companion files are written
        to ``RECOMMENDATION_TMP_DATA_DIR``.

        Args:
            tag_threshold: minimum number of occurrences for a tag to be kept.
            line_limit: reading stops once more than ``line_limit`` entries
                have been read, so up to ``line_limit + 1`` entries are used.

        Returns:
            The file name prefix used for the saved files: ``"FS"`` followed
            by today's date as ``YYYYMMDD``.
        """
        # Local import keeps this method self-contained.
        from collections import Counter

        index = loadFromJson(RECOMMENDATION_DATA_DIR + "Index.json")

        # Read the entries once; the same selection is reused further down.
        if self.verbose:
            sys.stdout.write("Reading index file (%i entries)... " %
                             len(index))
        entries = []
        all_tags = []
        for sid, sound_tags in index.items():
            entries.append((sid, sound_tags))
            all_tags += sound_tags
            # NOTE(review): `>` lets one entry more than line_limit through;
            # kept as in the original, confirm whether that is intended.
            if len(entries) > line_limit:
                break
        n_original_associations = len(all_tags)

        stats = {
            'n_sounds_in_matrix': len(entries),
        }
        saveToJson(RECOMMENDATION_TMP_DATA_DIR + 'Current_index_stats.json',
                   stats)
        if self.verbose:
            print("done!")

        # Count every tag in one pass instead of list.count() per unique tag.
        tag_occurrences = Counter(all_tags)
        unique_ts = list(set(all_tags))
        n_unique = len(unique_ts)
        if self.verbose:
            if unique_ts:
                sys.stdout.write("\rComputing tag occurrences 100.00%")
                sys.stdout.flush()
            print("")

        # Keep tags that reach the threshold; tags_ids are positions in unique_ts.
        tags = []
        tags_ids = []
        for pos, t in enumerate(unique_ts):
            if tag_occurrences[t] >= tag_threshold:
                tags.append(t)
                tags_ids.append(pos)

            if self.verbose:
                sys.stdout.write("\rFiltering tags %.2f%%" %
                                 (float(100 * (pos + 1)) / n_unique))
                sys.stdout.flush()

        nTags = len(tags)
        if self.verbose:
            print("")
            print("\tOriginal number of tags: " + str(n_unique))
            print("\tTags after filtering: " + str(nTags))

        # Generate resource-tags dictionary only with filtered tags
        if self.verbose:
            sys.stdout.write("Reading file for resources... ")
        sys.stdout.flush()
        kept_tags = set(tags)  # built once instead of once per resource
        res_tags = {}
        n_filtered_associations = 0
        for sid, sound_tags in entries:
            filtered = list(set(sound_tags).intersection(kept_tags))
            if filtered:
                res_tags[sid] = filtered
                n_filtered_associations += len(filtered)

        # list(...) so the ids can be enumerated and saved on Python 3 as well.
        resources = list(res_tags.keys())
        nResources = len(resources)
        if self.verbose:
            print("done!")

        # Generate association matrix
        if self.verbose:
            print("\tOriginal number of associations: " + str(
                n_original_associations))
            print("\tAssociations after filtering: " + str(
                n_filtered_associations))
            sys.stdout.write('Creating empty array of ' + str(nResources) +
                             ' x ' + str(nTags) + '... ')
        M = spmatrix.ll_mat(nResources, nTags)
        if self.verbose:
            print('done!')

        # Column lookup table replaces a linear tags.index() per association;
        # the row is the resource's position in `resources`.
        tag_column = dict((t, col) for col, t in enumerate(tags))
        done = 0
        for row, r_id in enumerate(resources):
            for t in res_tags[r_id]:
                M[row, tag_column[t]] = 1
                done += 1
                if self.verbose:
                    sys.stdout.write(
                        "\rGenerating association matrix %.2f%%" %
                        (float(100 * done) / n_filtered_associations))
                    sys.stdout.flush()
        if self.verbose:
            print("")

        # Save data
        if self.verbose:
            print("Saving association matrix, resource ids, tag ids and tag names")

        # Read the clock once so year/month/day cannot straddle midnight.
        today = datetime.today()
        filename = "FS%.4i%.2i%.2i" % (today.year, today.month, today.day)
        prefix = RECOMMENDATION_TMP_DATA_DIR + filename
        M.export_mtx(prefix + '_ASSOCIATION_MATRIX.mtx')
        save(prefix + '_RESOURCE_IDS.npy', resources)
        save(prefix + '_TAG_IDS.npy', tags_ids)
        save(prefix + '_TAG_NAMES.npy', tags)
        saveToJson(prefix + '_RESOURCES_TAGS.json',
                   res_tags,
                   verbose=self.verbose)

        return filename
Ejemplo n.º 10
0
    def tas_to_association_matrix(self, tag_threshold=0, line_limit=1000000000):
        """Build the resource x tag association matrix from ``Index.json`` and save it.

        The index is iterated as ``{resource id: [tags]}``. Tags occurring
        fewer than ``tag_threshold`` times are dropped, resources left without
        tags are skipped, and the matrix plus its companion files are written
        to ``RECOMMENDATION_TMP_DATA_DIR``.

        Args:
            tag_threshold: minimum number of occurrences for a tag to be kept.
            line_limit: reading stops once more than ``line_limit`` entries
                have been read, so up to ``line_limit + 1`` entries are used.

        Returns:
            The file name prefix used for the saved files: ``"FS"`` followed
            by today's date as ``YYYYMMDD``.
        """
        # Local import keeps this method self-contained.
        from collections import Counter

        index = loadFromJson(RECOMMENDATION_DATA_DIR + "Index.json")

        # Read the entries once; the same selection is reused further down.
        if self.verbose:
            sys.stdout.write("Reading index file (%i entries)... " % len(index))
        entries = []
        all_tags = []
        for sid, sound_tags in index.items():
            entries.append((sid, sound_tags))
            all_tags += sound_tags
            # NOTE(review): `>` lets one entry more than line_limit through;
            # kept as in the original, confirm whether that is intended.
            if len(entries) > line_limit:
                break
        n_original_associations = len(all_tags)

        stats = {
            'n_sounds_in_matrix': len(entries),
        }
        saveToJson(RECOMMENDATION_TMP_DATA_DIR + 'Current_index_stats.json', stats)
        if self.verbose:
            print("done!")

        # Count every tag in one pass instead of list.count() per unique tag.
        tag_occurrences = Counter(all_tags)
        unique_ts = list(set(all_tags))
        n_unique = len(unique_ts)
        if self.verbose:
            if unique_ts:
                sys.stdout.write("\rComputing tag occurrences 100.00%")
                sys.stdout.flush()
            print("")

        # Keep tags that reach the threshold; tags_ids are positions in unique_ts.
        tags = []
        tags_ids = []
        for pos, t in enumerate(unique_ts):
            if tag_occurrences[t] >= tag_threshold:
                tags.append(t)
                tags_ids.append(pos)

            if self.verbose:
                sys.stdout.write("\rFiltering tags %.2f%%" % (float(100 * (pos + 1)) / n_unique))
                sys.stdout.flush()

        nTags = len(tags)
        if self.verbose:
            print("")
            print("\tOriginal number of tags: " + str(n_unique))
            print("\tTags after filtering: " + str(nTags))

        # Generate resource-tags dictionary only with filtered tags
        if self.verbose:
            sys.stdout.write("Reading file for resources... ")
        sys.stdout.flush()
        kept_tags = set(tags)  # built once instead of once per resource
        res_tags = {}
        n_filtered_associations = 0
        for sid, sound_tags in entries:
            filtered = list(set(sound_tags).intersection(kept_tags))
            if filtered:
                res_tags[sid] = filtered
                n_filtered_associations += len(filtered)

        # list(...) so the ids can be enumerated and saved on Python 3 as well.
        resources = list(res_tags.keys())
        nResources = len(resources)
        if self.verbose:
            print("done!")

        # Generate association matrix
        if self.verbose:
            print("\tOriginal number of associations: " + str(n_original_associations))
            print("\tAssociations after filtering: " + str(n_filtered_associations))
            sys.stdout.write('Creating empty array of ' + str(nResources) + ' x ' + str(nTags) + '... ')
        M = spmatrix.ll_mat(nResources, nTags)
        if self.verbose:
            print('done!')

        # Column lookup table replaces a linear tags.index() per association;
        # the row is the resource's position in `resources`.
        tag_column = dict((t, col) for col, t in enumerate(tags))
        done = 0
        for row, r_id in enumerate(resources):
            for t in res_tags[r_id]:
                M[row, tag_column[t]] = 1
                done += 1
                if self.verbose:
                    sys.stdout.write("\rGenerating association matrix %.2f%%" % (float(100 * done) / n_filtered_associations))
                    sys.stdout.flush()
        if self.verbose:
            print("")

        # Save data
        if self.verbose:
            print("Saving association matrix, resource ids, tag ids and tag names")

        # Read the clock once so year/month/day cannot straddle midnight.
        today = datetime.today()
        filename = "FS%.4i%.2i%.2i" % (today.year, today.month, today.day)
        prefix = RECOMMENDATION_TMP_DATA_DIR + filename
        M.export_mtx(prefix + '_ASSOCIATION_MATRIX.mtx')
        save(prefix + '_RESOURCE_IDS.npy', resources)
        save(prefix + '_TAG_IDS.npy', tags_ids)
        save(prefix + '_TAG_NAMES.npy', tags)
        saveToJson(prefix + '_RESOURCES_TAGS.json', res_tags, verbose=self.verbose)

        return filename
Ejemplo n.º 11
0
    def process_tag_recommendation_data(self,
                                        resources_limit=None,
                                        tag_threshold=10,
                                        line_limit=99999999999999,
                                        recompute_all_classes=False,
                                        similarity_metric="cosine"):
        """Recompute the association matrix, classify resources and build the similarity data.

        Steps: build the association matrix, assign a class to every resource
        with the community detector, compute the similarity matrix for the
        general recommender, then one similarity matrix per detected class.

        Args:
            resources_limit: maximum number of training ids passed to each
                similarity computation (``None`` means no limit).
            tag_threshold: forwarded to ``tas_to_association_matrix``.
            line_limit: forwarded to ``tas_to_association_matrix``.
            recompute_all_classes: when true, every resource is classified
                again; otherwise only resources without a stored class are.
            similarity_metric: forwarded as ``metric`` to
                ``association_matrix_to_similarity_matrix``.

        Raises:
            Exception: if a class ends up with no training ids.
        """
        # Process tas file and turn into association matrix and derived files
        database_name = self.tas_to_association_matrix(tag_threshold=tag_threshold, line_limit=line_limit)

        print("Loading community detector...")
        cd = CommunityDetector(verbose=False, PATH=RECOMMENDATION_DATA_DIR + "Classifier")
        print(cd)

        # Classify existing resources
        resources_tags = loadFromJson(RECOMMENDATION_TMP_DATA_DIR + database_name + '_RESOURCES_TAGS.json')
        # A real list: the ids are sliced below, which a Python 3 keys view
        # does not support.
        instances_ids = list(resources_tags.keys())
        try:
            resource_class = loadFromJson(RECOMMENDATION_DATA_DIR + 'Classifier_classified_resources.json')
        except Exception:
            # No usable previous classification: classify everything.
            resource_class = dict()

        n_instances = len(instances_ids)
        for count, instance_id in enumerate(instances_ids):
            if recompute_all_classes or instance_id not in resource_class:
                resource_class[instance_id] = cd.detectCommunity(input_tags=resources_tags[instance_id])

            if self.verbose:
                sys.stdout.write("\rClassifying resources... %.2f%%" % (float(100 * (count + 1)) / n_instances))
                sys.stdout.flush()

        print("")
        saveToJson(RECOMMENDATION_DATA_DIR + 'Classifier_classified_resources.json', resource_class)
        print("")

        print("\nComputing data for general recommender...")
        self.association_matrix_to_similarity_matrix(
            dataset=database_name,
            training_set=instances_ids[0:resources_limit],
            save_sim=True,
            is_general_recommender=True,
            metric=similarity_metric,
        )

        print("\nComputing data for class recommenders...")
        instance_id_class = [(instance_id, resource_class[instance_id]) for instance_id in instances_ids]
        # Distinct classes in order of first appearance.
        distinct_classes = []
        for _, class_id in instance_id_class:
            if class_id not in distinct_classes:
                distinct_classes.append(class_id)

        print(distinct_classes)

        for collection_id in distinct_classes:
            print("\nComputing recommender for collection %s..." % collection_id)

            # All resources from the training set classified as the selected category
            # (instead of all manually labeled)
            training_ids = [
                instance_id for instance_id, class_id in instance_id_class
                if class_id == collection_id
            ]
            # Add limit
            training_ids = training_ids[0:resources_limit]

            if len(training_ids) < 1:
                raise Exception("Too less training ids for collection %s" % collection_id)

            self.association_matrix_to_similarity_matrix(
                dataset=database_name,
                training_set=training_ids,
                save_sim=True,
                out_name_prefix=collection_id,
                is_general_recommender=False,
                metric=similarity_metric,
            )