def modeEvalFunction(config, setting):
    pdb_bound = config.getInputFile(setting, 'protein_bound')
    pdb_unbound = config.getInputFile(setting, 'protein_unbound')
    mode_file = config.getInputFile(setting, 'mode_file')
    
    output = config.getOutputFile(setting,'out')

    bound_list = utils.readFileToList(pdb_bound)
    unbound_list = utils.readFileToList(pdb_unbound)

    unbound_residues = utils.getResidueFromPDBlines(unbound_list)
     
    bound_CA = utils.getCAOnlyFromPDBLines(bound_list)
    unbound_CA = utils.getCAOnlyFromPDBLines(unbound_list)

    bound_CA_pos = utils.getCoordinatesFromPDBlines(bound_CA)
    unbound_CA_pos = utils.getCoordinatesFromPDBlines(unbound_CA)

    modes = utils.read_modes(mode_file)
    cumulative_overlap = 0
    eval_dict = {}
    for modeIdx, mode in modes.items():
        ca_modes = utils.getCAModes(unbound_residues, mode['evec'])
        overlap = utils.getOverlap(unbound_CA_pos, bound_CA_pos, ca_modes)
        cumulative_overlap += overlap**2
        contributionCA = utils.getModeContribution(bound_CA_pos - unbound_CA_pos, ca_modes)
        norm = utils.getModeNorm(mode['evec'])
        contribution = contributionCA * norm
        magnitude = utils.getModeMagnitude(ca_modes)
        maximaIndices = utils.getIndexMaxima(magnitude)
        maxima = magnitude[maximaIndices]
        eval_dict[modeIdx] = {
            'overlap': overlap,
            'cum_overlap': np.sqrt(cumulative_overlap),
            'eigenvalue': mode['eval'],
            'norm': norm,
            'contribution': contribution,
            'contribution_ca': contributionCA,
            'maxima_indices': maximaIndices.tolist(),
            'maxima_values': maxima.tolist()
        }
    
    
    utils.saveToJson(output, {'bound':pdb_bound, 'unbound':pdb_unbound, 'mode_file':mode_file, 'modes': eval_dict})
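All of these examples persist their results through saveToJson / utils.saveToJson. The helper itself is not shown in the listing; a minimal sketch of what it presumably looks like, assuming it is a thin wrapper around json.dump that matches the saveToJson(path, data, verbose=False) call seen later, would be:

import json

def saveToJson(path, data, verbose=True):
    # Hypothetical stand-in for the helper used throughout these examples;
    # the real implementation may handle encoding or logging differently.
    with open(path, 'w') as f:
        json.dump(data, f, indent=4)
    if verbose:
        print("Saved %s" % path)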
Example 2
    def add_to_index(self, sound_ids, sound_tagss):
        sound_ids = sound_ids[0].split(",")
        sound_tags = [
            stags.split(",") for stags in sound_tagss[0].split("-!-!-")
        ]
        logger.info('Adding %i sounds to recommendation index' %
                    len(sound_ids))

        for count, sound_id in enumerate(sound_ids):
            sid = sound_id
            stags = sound_tags[count]
            self.index[sid] = stags

        if len(self.index.keys()) % 1000 == 0:
            # Every 1000 indexed sounds, save the index
            logger.info('Saving tagrecommendation index...')
            saveToJson(RECOMMENDATION_DATA_DIR + 'Index.json',
                       self.index,
                       verbose=False)
            self.index_stats['biggest_id_in_index'] = max(
                [int(key) for key in self.index.keys()])
            self.index_stats['n_sounds_in_index'] = len(self.index.keys())

        result = {'error': False, 'result': True}
        return json.dumps(result)
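The arguments arrive as single-element lists of delimiter-joined strings: sound ids joined by commas, and the per-sound tag lists joined by "-!-!-". A small self-contained illustration of the decoding performed above, using hypothetical sample data (the ids mirror the Index.json excerpt further down):

# Hypothetical sample data showing the expected argument encoding.
sound_ids = ["1142,1143"]
sound_tagss = ["glitch,loop,plucked,string-!-!-glitch,loop,string"]

ids = sound_ids[0].split(",")
# ids == ['1142', '1143']
tags = [stags.split(",") for stags in sound_tagss[0].split("-!-!-")]
# tags == [['glitch', 'loop', 'plucked', 'string'], ['glitch', 'loop', 'string']]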
Example 3
    def clear_temp_files(self):

        new_data = False
        for filename in os.listdir(RECOMMENDATION_TMP_DATA_DIR):
            if "SIMILARITY_MATRIX" in filename and "SUBSET" in filename:
                new_data = True
                break

        if not new_data:
            raise Exception(
                "There is no new matrix data to update the tag recommendation system"
            )

        for filename in os.listdir(RECOMMENDATION_DATA_DIR):
            file_extension = filename.split(".")[-1]
            if file_extension in ['npy', 'json', 'pkl']:
                if "Classifier" not in filename and "Index" not in filename:  # Do not alter Classifier files
                    if filename[0:6] == "backup":
                        # Delete old backups
                        print "Removing %s" % RECOMMENDATION_DATA_DIR + filename
                        os.remove(RECOMMENDATION_DATA_DIR + filename)
                    else:
                        # Set previous matrices to "backup mode" (will be deleted in the next update)
                        print "Setting to backup %s" % RECOMMENDATION_DATA_DIR + filename
                        os.rename(
                            RECOMMENDATION_DATA_DIR + filename,
                            RECOMMENDATION_DATA_DIR + "backup_" + filename)

        current_database_name = ""
        class_names = []
        for filename in os.listdir(RECOMMENDATION_TMP_DATA_DIR):
            file_extension = filename.split(".")[-1]
            if "Index" not in filename:
                if ("SIMILARITY_MATRIX" in filename
                        and "SUBSET" in filename) or "stats" in filename:
                    # Move similarity matrix to recommendation data dir
                    print "Moving %s" % RECOMMENDATION_TMP_DATA_DIR + filename
                    os.rename(RECOMMENDATION_TMP_DATA_DIR + filename,
                              RECOMMENDATION_DATA_DIR + filename)
                    if "stats" not in filename:
                        current_database_name = filename.split("_")[0]
                        class_names.append(filename.split("_")[1])
                else:
                    # Remove remaining files in tmp dir (except for the tas file)
                    print "Clearing %s" % RECOMMENDATION_TMP_DATA_DIR + filename
                    os.remove(RECOMMENDATION_TMP_DATA_DIR + filename)

        class_names = list(set(class_names))
        saveToJson(
            RECOMMENDATION_DATA_DIR + 'Current_database_and_class_names.json',
            {
                'database': current_database_name,
                'classes': class_names
            })
        # Reload tag recommendation server
        urllib.urlopen('http://%s:%i/tagrecommendation/reload' %
                       (TAGRECOMMENDATION_ADDRESS, TAGRECOMMENDATION_PORT))
def FindTermini(config, setting):
    cutSetting = config.getSetting(setting)
    inputPdb = config.getInputFile(setting, "pdb")
    looseTerminiLog = config.getOutputFile(setting, "out")
    cutoff = cutSetting['cutoff']

    if config.getSetting(setting)['verbose']:
        print("Find Termini from  " + inputPdb)
    if not config.getSetting(setting)["dryRun"]:
        # log = utils.findAndCutLooseTermini(inputPdb, cutPdb, cutoff)
        log = utils.FindLooseTermini(inputPdb, cutoff=cutoff)
        log['cutoff'] = cutoff
        utils.saveToJson(looseTerminiLog, log)
def evaluateModeDOFS(config, setting):
    input_dof_file = config.getInputFile(setting, "input_dof")
    mode_evaluation_rec = config.getInputFile(setting, "mode_evaluation_rec")
    mode_evaluation_lig = config.getInputFile(setting, "mode_evaluation_lig")
    output = config.getOutputFile(setting, "out")

    dof_eval_settings = config.getSetting(setting)
    num_eval = dof_eval_settings['num_eval']
    numModesRec = dof_eval_settings['numModesRec']
    numModesLig = dof_eval_settings['numModesLig']

    if config.getSetting(setting)['verbose']:
        print("SETTING: ", setting.upper(), " evaluating dofs for",
              input_dof_file, ' and output to ', output)
    if not config.getSetting(setting)["dryRun"]:
        dof_dict = utils.read_Dof(input_dof_file)
        sorted_keys = np.sort(np.asarray(list(dof_dict.keys()), dtype=int))

        contributions_rec = {}
        for key, val in json.load(open(mode_evaluation_rec,
                                       'r'))['modes'].items():
            contributions_rec[int(key)] = val['contribution']

        contributions_lig = {}
        for key, val in json.load(open(mode_evaluation_lig,
                                       'r'))['modes'].items():
            contributions_lig[int(key)] = val['contribution']

        result = {}
        for key in sorted_keys[:num_eval]:
            dof = dof_dict[key]
            modes_rec = dof['rec'][6:]
            modes_lig = dof['lig'][6:]
            rec = {}
            lig = {}
            for i, mode in enumerate(modes_rec):
                rec[str(i + 1)] = {
                    'ratio': np.float64(mode) / contributions_rec[i + 1] - 1,
                    'dof': mode,
                    'mode': contributions_rec[i + 1]
                }
            for i, mode in enumerate(modes_lig):
                lig[str(i + 1)] = {
                    'ratio': np.float64(mode) / contributions_lig[i + 1] - 1,
                    'dof': mode,
                    'mode': contributions_lig[i + 1]
                }
            result[str(key)] = {'rec': rec, 'lig': lig}
        utils.saveToJson(filename=output, data=result)
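For each of the first num_eval DOF entries, the 'ratio' field above is dof / contribution - 1, i.e. the relative deviation of the sampled mode amplitude from the amplitude suggested by the bound-unbound difference stored in the mode evaluation files; for example, a DOF value of 0.9 against a contribution of 1.2 gives a ratio of -0.25.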
Example 6
    def clear_temp_files(self):

        new_data = False
        for filename in os.listdir(RECOMMENDATION_TMP_DATA_DIR):
            if "SIMILARITY_MATRIX" in filename and "SUBSET" in filename:
                new_data = True
                break

        if not new_data:
            raise Exception("There is no new matrix data to update the tag recommendation system")

        for filename in os.listdir(RECOMMENDATION_DATA_DIR):
            file_extension = filename.split(".")[-1]
            if file_extension in ['npy', 'json', 'pkl']:
                if "Classifier" not in filename and "Index" not in filename:  # Do not alter Classifier files
                    if filename[0:6] == "backup":
                        # Delete old backups
                        print "Removing %s" % RECOMMENDATION_DATA_DIR + filename
                        os.remove(RECOMMENDATION_DATA_DIR + filename)
                    else:
                        # Set previous matrices to "backup mode" (will be deleted in the next update)
                        print "Setting to backup %s" % RECOMMENDATION_DATA_DIR + filename
                        os.rename(RECOMMENDATION_DATA_DIR + filename, RECOMMENDATION_DATA_DIR + "backup_" + filename)

        current_database_name = ""
        class_names = []
        for filename in os.listdir(RECOMMENDATION_TMP_DATA_DIR):
            file_extension = filename.split(".")[-1]
            if "Index" not in filename:
                if ("SIMILARITY_MATRIX" in filename and "SUBSET" in filename) or "stats" in filename:
                    # Move similarity matrix to recommendation data dir
                    print "Moving %s" % RECOMMENDATION_TMP_DATA_DIR + filename
                    os.rename(RECOMMENDATION_TMP_DATA_DIR + filename, RECOMMENDATION_DATA_DIR + filename)
                    if "stats" not in filename:
                        current_database_name = filename.split("_")[0]
                        class_names.append(filename.split("_")[1])
                else:
                    # Remove remaining files in tmp dir (except for the tas file)
                    print "Clearing %s" % RECOMMENDATION_TMP_DATA_DIR + filename
                    os.remove(RECOMMENDATION_TMP_DATA_DIR + filename)

        class_names = list(set(class_names))
        saveToJson(RECOMMENDATION_DATA_DIR + 'Current_database_and_class_names.json', {'database': current_database_name, 'classes':class_names})
        # Reload tag recommendation server
        urllib.urlopen('http://%s:%i/tagrecommendation/reload' % (TAGRECOMMENDATION_ADDRESS, TAGRECOMMENDATION_PORT))
    def add_to_index(self, sound_ids, sound_tagss):
        sound_ids = sound_ids[0].split(",")
        sound_tags = [stags.split(",") for stags in sound_tagss[0].split("-!-!-")]
        logger.info('Adding %i sounds to recommendation index' % len(sound_ids))

        for count, sound_id in enumerate(sound_ids):
            sid = sound_id
            stags = sound_tags[count]
            self.index[sid] = stags

        if len(self.index.keys()) % 1000 == 0:
            # Every 1000 indexed sounds, save the index
            logger.info('Saving tagrecommendation index...')
            saveToJson(RECOMMENDATION_DATA_DIR + 'Index.json', self.index, verbose=False)
            self.index_stats['biggest_id_in_index'] = max([int(key) for key in self.index.keys()])
            self.index_stats['n_sounds_in_index'] = len(self.index.keys())

        result = {'error': False, 'result': True}
        return json.dumps(result)
def GetInterface(config, setting):
    #receptor = config.getInputFile(setting, 'receptor')
    #ligand = config.getInputFile(setting,'ligand')
    pdb = config.getInputFile(setting, 'pdb')
    interfaceFile = config.getOutputFile(setting, 'out')
    cutoff = config.getSetting(setting)['cutoff']

    if config.getSetting(setting)['verbose']:
        print("Get interface from pdb " + pdb )
    if not config.getSetting(setting)["dryRun"]:
        structures = utils.parseBIOPdbToStructure(pdb)
        interfaces = []
        for struct in structures:
            receptor = struct['A']
            ligand = struct['B']

            contactResiduesRec, contactResiduesLig = utils.getInterfaceResidues(receptor, ligand, cutoff)
            recinterfaceResidues = utils.getResidueIds(contactResiduesRec)
            liginterfaceResidues = utils.getResidueIds(contactResiduesLig)

            interfaces.append({
                'model': struct.id,
                "recInterfaceResidues": recinterfaceResidues,
                "ligInterfaceResidues": liginterfaceResidues
            })

        utils.saveToJson(interfaceFile, {'file': pdb, 'cutoff': cutoff, 'interfaces': interfaces})
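utils.getInterfaceResidues is not shown in this listing. Assuming the structures returned by parseBIOPdbToStructure are Biopython models, a minimal sketch of such a helper using Bio.PDB.NeighborSearch might look as follows (the name and behaviour are assumptions, not the project's actual code):

from Bio.PDB import NeighborSearch

def get_interface_residues(receptor, ligand, cutoff):
    # Index all receptor atoms, then collect the residue pairs whose atoms
    # fall within `cutoff` Angstroms of each other.
    ns = NeighborSearch(list(receptor.get_atoms()))
    contact_rec, contact_lig = set(), set()
    for lig_res in ligand:
        for atom in lig_res:
            close = ns.search(atom.coord, cutoff, level='R')
            if close:
                contact_lig.add(lig_res)
                contact_rec.update(close)
    return list(contact_rec), list(contact_lig)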
def evalProtein(config, setting):
    secondary_file = config.getInputFile(setting, 'secondary')
    output = config.getOutputFile(setting, 'out')

    if config.getSetting(setting)['verbose']:
        print("SETTING: ", setting.upper(), " evaluating protein for",
              secondary_file)
    if not config.getSetting(setting)["dryRun"]:
        secLines = utils.getSecLines(utils.readFileToList(secondary_file))
        area = 0
        secondary, aminoAcids = [], []
        aa_area = {
            'LYS': 0,
            'PRO': 0,
            'ILE': 0,
            'TRP': 0,
            'GLU': 0,
            'GLN': 0,
            'GLY': 0,
            'SER': 0,
            'PHE': 0,
            'HIS': 0,
            'TYR': 0,
            'LEU': 0,
            'ASP': 0,
            'ASN': 0,
            'ARG': 0,
            'THR': 0,
            'ALA': 0,
            'CYS': 0,
            'VAL': 0,
            'MET': 0
        }
        sec_area = {'C': 0, 'E': 0, 'B': 0, 'T': 0, 'H': 0, 'G': 0, 'b': 0}
        for line in secLines:
            a = float(line[9])
            area += a
            secondary.append(line[5])
            aminoAcids.append(line[1])
            aa_area[line[1]] += a
            sec_area[line[5]] += a

        area = np.asarray(area).sum()
        aa = {
            'LYS': 0,
            'PRO': 0,
            'ILE': 0,
            'TRP': 0,
            'GLU': 0,
            'GLN': 0,
            'GLY': 0,
            'SER': 0,
            'PHE': 0,
            'HIS': 0,
            'TYR': 0,
            'LEU': 0,
            'ASP': 0,
            'ASN': 0,
            'ARG': 0,
            'THR': 0,
            'ALA': 0,
            'CYS': 0,
            'VAL': 0,
            'MET': 0
        }
        sec = {'C': 0, 'E': 0, 'B': 0, 'T': 0, 'H': 0, 'G': 0, 'b': 0}

        size = float(len(secondary))
        for key, val in Counter(secondary).items():
            sec[key] = val / size
        for key, val in Counter(aminoAcids).items():
            aa[key] = val / size

        for key in aa_area.keys():
            aa_area[key] /= area
        for key in sec_area.keys():
            sec_area[key] /= area
        utils.saveToJson(
            output, {
                'secondary': sec,
                'aminoAcids': aa,
                'area': area,
                'size': size,
                'sec_area': sec_area,
                'aa_area': aa_area
            })
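Note that sec and aa hold per-residue count fractions (each count divided by the number of residues), while sec_area and aa_area weight each residue by the area column of the secondary-structure file and are normalised by the total area.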
def GetInterface(config, setting):
    pdb = config.getInputFile(setting, 'pdb')
    interfaceFile = config.getOutputFile(setting, 'out')
    receptor_filename = config.getInputFile(setting, 'receptor')
    ligand_filename = config.getInputFile(setting, 'ligand')
    receptorSec_filename = config.getInputFile(setting, 'receptorSec')
    ligandSec_filename = config.getInputFile(setting, 'ligandSec')

    cutoff = config.getSetting(setting)['cutoff']

    if config.getSetting(setting)['verbose']:
        print("SETTING: ", setting.upper(), " Get interface from pdb " + pdb)
    if not config.getSetting(setting)["dryRun"]:
        try:
            receptorSec = utils.getSecLines(
                utils.readFileToList(receptorSec_filename))
            ligandSec = utils.getSecLines(
                utils.readFileToList(ligandSec_filename))

            recmap = utils.getUniqueResIds(
                utils.getResidueFromPDBlines(
                    utils.readFileToList(receptor_filename)))
            ligmap = utils.getUniqueResIds(
                utils.getResidueFromPDBlines(
                    utils.readFileToList(ligand_filename)))
            structures = utils.parseBIOPdbToStructure(pdb)

            interfaces = []
            if len(structures) > 0:
                for struct in structures:
                    receptor = struct['A']
                    ligand = struct['B']

                    contactResiduesRec, contactResiduesLig = utils.getInterfaceResidues(
                        receptor, ligand, cutoff)
                    if len(contactResiduesRec) > 0 or len(
                            contactResiduesLig) > 0:
                        recinterfaceResidues = utils.getResidueIds(
                            contactResiduesRec)
                        liginterfaceResidues = utils.getResidueIds(
                            contactResiduesLig)

                        interfacePosRec = utils.getResidueCoordinates(
                            contactResiduesRec).T
                        interfacePosLig = utils.getResidueCoordinates(
                            contactResiduesLig).T

                        interfaceRecIndices = [
                            recmap[key] for key in recinterfaceResidues
                        ]
                        interfaceLigIndices = [
                            ligmap[key] for key in liginterfaceResidues
                        ]

                        isecRec = []
                        AARec = []
                        areaRec = 0
                        for i in interfaceRecIndices:
                            line = receptorSec[i]
                            isecRec.append(line[5])
                            AARec.append(line[1])
                            areaRec += float(line[9])

                        isecLig = []
                        AALig = []
                        areaLig = 0
                        for i in interfaceLigIndices:
                            line = ligandSec[i]
                            isecLig.append(line[5])
                            AALig.append(line[1])
                            areaLig += float(line[9])

                        AARecCount = {
                            'LYS': 0,
                            'PRO': 0,
                            'ILE': 0,
                            'TRP': 0,
                            'GLU': 0,
                            'GLN': 0,
                            'GLY': 0,
                            'SER': 0,
                            'PHE': 0,
                            'HIS': 0,
                            'TYR': 0,
                            'LEU': 0,
                            'ASP': 0,
                            'ASN': 0,
                            'ARG': 0,
                            'THR': 0,
                            'ALA': 0,
                            'CYS': 0,
                            'VAL': 0,
                            'MET': 0
                        }
                        aalen = float(len(AARec))
                        for key, value in Counter(AARec).items():
                            AARecCount[key] = value / aalen

                        AALigCount = {
                            'LYS': 0,
                            'PRO': 0,
                            'ILE': 0,
                            'TRP': 0,
                            'GLU': 0,
                            'GLN': 0,
                            'GLY': 0,
                            'SER': 0,
                            'PHE': 0,
                            'HIS': 0,
                            'TYR': 0,
                            'LEU': 0,
                            'ASP': 0,
                            'ASN': 0,
                            'ARG': 0,
                            'THR': 0,
                            'ALA': 0,
                            'CYS': 0,
                            'VAL': 0,
                            'MET': 0
                        }
                        aalen = float(len(AALig))
                        for key, value in Counter(AALig).items():
                            AALigCount[key] = value / aalen

                        countSecRec = {
                            'C': 0,
                            'E': 0,
                            'B': 0,
                            'T': 0,
                            'H': 0,
                            'G': 0,
                            'b': 0
                        }
                        lenSec = float(len(isecRec))
                        for key, value in Counter(isecRec).items():
                            countSecRec[key] = value / lenSec

                        countSecLig = {
                            'C': 0,
                            'E': 0,
                            'B': 0,
                            'T': 0,
                            'H': 0,
                            'G': 0,
                            'b': 0
                        }
                        lenSec = float(len(isecLig))
                        for key, value in Counter(isecLig).items():
                            countSecLig[key] = value / lenSec

                        interfaces.append({
                            'model': struct.id,
                            "recInterfaceResidues": recinterfaceResidues,
                            "ligInterfaceResidues": liginterfaceResidues,
                            "recAA": AARec,
                            'ligAA': AALig,
                            'rec_x': list(interfacePosRec[0]),
                            'rec_y': list(interfacePosRec[1]),
                            'rec_z': list(interfacePosRec[2]),
                            'lig_x': list(interfacePosLig[0]),
                            'lig_y': list(interfacePosLig[1]),
                            'lig_z': list(interfacePosLig[2]),
                            'rec_sec': isecRec,
                            'lig_sec': isecLig,
                            'countSecRec': countSecRec,
                            'countSecLig': countSecLig,
                            'AALigCount': AALigCount,
                            'AARecCount': AARecCount,
                            'areaRec': areaRec,
                            'areaLig': areaLig
                        })

                    utils.saveToJson(interfaceFile, {
                        'file': pdb,
                        'cutoff': cutoff,
                        'interfaces': interfaces
                    })
        except Exception:
            print("eval interface: FAILED", interfaceFile)
def modeEvalFunction(config, setting):
    pdb_bound = config.getInputFile(setting, 'protein_bound')
    pdb_unbound = config.getInputFile(setting, 'protein_unbound')
    mode_file = config.getInputFile(setting, 'mode_file')
    secondary_file = config.getInputFile(setting, 'secondary')

    output = config.getOutputFile(setting, 'out')

    if config.getSetting(setting)['verbose']:
        print("SETTING: ", setting.upper(), " evaluating modes for", mode_file,
              " and output to ", output)
    if not config.getSetting(setting)["dryRun"]:
        try:
            bound_list = utils.readFileToList(pdb_bound)
            unbound_list = utils.readFileToList(pdb_unbound)

            secondary = [
                line[5] for line in utils.getSecLines(
                    utils.readFileToList(secondary_file))
            ]

            currid = None
            #resMap = {}
            indices = []
            count = 0
            for rid in utils.getResidueFromPDBlines(unbound_list):
                if rid != currid:
                    #       resMap[rid] = count
                    indices.append(count)
                    currid = rid
                count += 1
            #resMap = utils.getUniqueResIds(utils.getResidueFromPDBlines(unbound_list))
            # indices = list(resMap.values())
            # indices.sort()

            #print(indices)

            bound_CA = utils.getCAOnlyFromPDBLines(bound_list)
            unbound_CA = utils.getCAOnlyFromPDBLines(unbound_list)
            unbound_residues = utils.getResidueNamesFromPDBlines(unbound_CA)

            bound_CA_pos = utils.getCoordinatesFromPDBlines(bound_CA)
            unbound_CA_pos = utils.getCoordinatesFromPDBlines(unbound_CA)

            modes = utils.read_modes(mode_file)
            cumulative_overlap = 0
            eval_dict = {}
            for modeIdx, mode in modes.items():
                #ca_modes = utils.getCAModes(unbound_residues,mode['evec'])
                ca_modes = [mode['evec'][idx] for idx in indices]
                area_aa = {
                    'LYS': 0,
                    'PRO': 0,
                    'ILE': 0,
                    'TRP': 0,
                    'GLU': 0,
                    'GLN': 0,
                    'GLY': 0,
                    'SER': 0,
                    'PHE': 0,
                    'HIS': 0,
                    'TYR': 0,
                    'LEU': 0,
                    'ASP': 0,
                    'ASN': 0,
                    'ARG': 0,
                    'THR': 0,
                    'ALA': 0,
                    'CYS': 0,
                    'VAL': 0,
                    'MET': 0
                }
                area_sec = {
                    'C': 0,
                    'E': 0,
                    'B': 0,
                    'T': 0,
                    'H': 0,
                    'G': 0,
                    'b': 0
                }
                integral = 0
                for i, vec in enumerate(ca_modes):
                    ampl = vec[0]**2 + vec[1]**2 + vec[2]**2
                    integral += ampl
                    area_aa[unbound_residues[i]] += ampl
                    area_sec[secondary[i]] += ampl

                for key in area_aa.keys():
                    area_aa[key] /= integral
                for key in area_sec.keys():
                    area_sec[key] /= integral

                overlap = utils.getOverlap(unbound_CA_pos, bound_CA_pos,
                                           ca_modes)
                cumulative_overlap += overlap**2
                contributionCA = utils.getModeContribution(
                    bound_CA_pos - unbound_CA_pos, ca_modes).tolist()
                norm = utils.getModeNorm(mode['evec'])
                contribution = contributionCA * norm
                magnitude = utils.getModeMagnitude(ca_modes)
                maximaIndices = utils.getIndexMaxima(magnitude)
                maxima = magnitude[maximaIndices]
                eval_dict[modeIdx] = {
                    'overlap': overlap,
                    'cum_overlap': np.sqrt(cumulative_overlap),
                    'eigenvalue': mode['eval'],
                    'norm': norm,
                    'contribution': contribution,
                    'contribution_ca': contributionCA,
                    'maxima_indices': maximaIndices.tolist(),
                    'maxima_values': maxima.tolist(),
                    'area_aa': area_aa,
                    'area_sec': area_sec
                }

            utils.saveToJson(
                output, {
                    'bound': pdb_bound,
                    'unbound': pdb_unbound,
                    'mode_file': mode_file,
                    'modes': eval_dict
                })
        except Exception:
            print("failed to evaluate protein", pdb_unbound)
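utils.getOverlap is not defined in this listing. Judging from the cumulative-overlap bookkeeping above (summing squared overlaps and taking the square root), it presumably returns the normalised projection of the unbound-to-bound displacement onto each mode; a plausible sketch, assuming (N, 3) NumPy arrays:

import numpy as np

def get_overlap(unbound_pos, bound_pos, ca_mode):
    # Hypothetical helper: normalised dot product between the displacement
    # vector and the per-CA mode vector (the real utils.getOverlap may differ).
    diff = np.asarray(bound_pos) - np.asarray(unbound_pos)
    mode = np.asarray(ca_mode)
    denom = np.linalg.norm(diff) * np.linalg.norm(mode)
    return abs(np.sum(diff * mode)) / denom if denom > 0 else 0.0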
Example 12
    def tas_to_association_matrix(self,
                                  tag_threshold=0,
                                  line_limit=1000000000):

        index = loadFromJson(RECOMMENDATION_DATA_DIR + "Index.json")
        # Get tags from file
        ts = []
        idx = 0
        n_original_associations = 0
        sound_ids = []
        if self.verbose:
            print "Reading index file (%i entries)..." % len(index.items()),
        for sid, tags in index.items():
            ts += tags
            n_original_associations += len(tags)
            sound_ids.append(sid)

            idx += 1
            if idx > line_limit:
                break

        stats = {
            'n_sounds_in_matrix': len(sound_ids),
            #'biggest_id': max([int(sid) for sid in sound_ids])
        }
        saveToJson(RECOMMENDATION_TMP_DATA_DIR + 'Current_index_stats.json',
                   stats)
        if self.verbose:
            print "done!"

        # Compute tag occurrences after loading the file
        tag_occurrences = dict()
        unique_ts = list(set(ts))
        for id, t in enumerate(unique_ts):
            tag_occurrences[t] = ts.count(t)

            if self.verbose:
                sys.stdout.write("\rComputing tag occurrences %.2f%%" %
                                 (float(100 * (id + 1)) / len(unique_ts)))
                sys.stdout.flush()
        print ""
        tags = []
        tags_ids = []
        for id, t in enumerate(unique_ts):

            if tag_occurrences[t] >= tag_threshold:
                tags.append(t)
                tags_ids.append(id)

            if self.verbose:
                sys.stdout.write("\rFiltering tags %.2f%%" %
                                 (float(100 * (id + 1)) / len(unique_ts)))
                sys.stdout.flush()

        nTags = len(tags)
        if self.verbose:
            print ""
            print "\tOriginal number of tags: " + str(len(unique_ts))
            print "\tTags after filtering: " + str(nTags)

        # Generate resource-tags dictionary only with filtered tags
        if self.verbose:
            print "Reading file for resources...",
        sys.stdout.flush()
        res_tags = {}
        res_user = {}
        res_tags_no_filt = {}
        idx = 0
        n_filtered_associations = 0
        for sid, stags in index.items():
            resource = sid
            user = None
            assigned_tags = stags
            assigned_tags_filt = list(
                set(assigned_tags).intersection(set(tags)))
            res_tags_no_filt[resource] = assigned_tags
            res_user[resource] = user
            if len(assigned_tags_filt) > 0:
                res_tags[resource] = assigned_tags_filt
                n_filtered_associations += len(assigned_tags_filt)

            idx += 1
            if idx > line_limit:
                break

        resources = res_tags.keys()
        nResources = len(resources)
        resources_ids = range(0, nResources)
        if self.verbose:
            print "done!"

        # Generate association matrix
        if self.verbose:
            print "\tOriginal number of associations: " + str(
                n_original_associations)
            print "\tAssociations after filtering: " + str(
                n_filtered_associations)

        if self.verbose:
            print 'Creating empty array of ' + str(nResources) + ' x ' + str(
                nTags) + '...',
        M = spmatrix.ll_mat(nResources, nTags)
        if self.verbose:
            print 'done!'

        done = 0
        for r_id in resources:
            for t in res_tags[r_id]:
                M[resources.index(r_id), tags.index(t)] = 1
                done += 1
                if self.verbose:
                    sys.stdout.write(
                        "\rGenerating association matrix %.2f%%" %
                        (float(100 * done) / n_filtered_associations))
                    sys.stdout.flush()
        if self.verbose:
            print ""

        # Save data
        if self.verbose:
            print "Saving association matrix, resource ids, tag ids and tag names"

        filename = "FS%.4i%.2i%.2i" % (datetime.today().year,
                                       datetime.today().month,
                                       datetime.today().day)
        M.export_mtx(RECOMMENDATION_TMP_DATA_DIR + filename +
                     '_ASSOCIATION_MATRIX.mtx')
        save(RECOMMENDATION_TMP_DATA_DIR + filename + '_RESOURCE_IDS.npy',
             resources)
        save(RECOMMENDATION_TMP_DATA_DIR + filename + '_TAG_IDS.npy', tags_ids)
        save(RECOMMENDATION_TMP_DATA_DIR + filename + '_TAG_NAMES.npy', tags)
        saveToJson(RECOMMENDATION_TMP_DATA_DIR + filename +
                   '_RESOURCES_TAGS.json',
                   res_tags,
                   verbose=self.verbose)
        #saveToJson(RECOMMENDATION_TMP_DATA_DIR + filename + '_RESOURCES_TAGS_NO_FILTER.json',res_tags_no_filt, verbose = self.verbose)
        #saveToJson(RECOMMENDATION_TMP_DATA_DIR + filename + '_RESOURCES_USER.json',res_user, verbose = self.verbose)

        return filename
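spmatrix.ll_mat above appears to come from the old PySparse package. A rough, self-contained equivalent of the association-matrix construction using scipy.sparse, with dummy data standing in for the resources, res_tags and tags built above, could look like this:

from scipy.sparse import lil_matrix

# Dummy stand-ins for the variables computed in tas_to_association_matrix.
resources = ['1142', '1143']
res_tags = {'1142': ['glitch', 'loop'], '1143': ['loop', 'string']}
tags = ['glitch', 'loop', 'string']

tag_index = {t: j for j, t in enumerate(tags)}  # avoids repeated list.index calls
M = lil_matrix((len(resources), len(tags)))
for i, r_id in enumerate(resources):
    for t in res_tags[r_id]:
        M[i, tag_index[t]] = 1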
Example 13
class RecommendationDataProcessor:
    '''
    This class has methods to generate all the files that the tag recommendation system needs to recommend tags.
    To generate these files, the data processor needs the Index.json file with the tag association information from Freesound.
    The Index.json file must have the following form:

    {
        "1142": [
            "glitch",
            "loop",
            "plucked",
            "string"
        ],
        "1143": [
            "glitch",
            "loop",
            "plucked",
            "string"
        ], ...
    }

    The files that are generated by the system are:
    (for every sound class: Soundscape, Music, Fx, Samples, Speech)
    [[DATABASE]]_[[CLASSNAME]]_SIMILARITY_MATRIX_cosine_SUBSET.npy
    [[DATABASE]]_[[CLASSNAME]]_SIMILARITY_MATRIX_cosine_SUBSET_TAG_NAMES.npy
    '''

    verbose = None

    def __init__(self, verbose=True):
        self.verbose = verbose

    def __repr__(self):
        return "RecommendationDataProcessor instance"

    def tas_to_association_matrix(self,
                                  tag_threshold=0,
                                  line_limit=1000000000):

        index = loadFromJson(RECOMMENDATION_DATA_DIR + "Index.json")
        # Get tags from file
        ts = []
        idx = 0
        n_original_associations = 0
        sound_ids = []
        if self.verbose:
            print "Reading index file (%i entries)..." % len(index.items()),
        for sid, tags in index.items():
            ts += tags
            n_original_associations += len(tags)
            sound_ids.append(sid)

            idx += 1
            if idx > line_limit:
                break

        stats = {
            'n_sounds_in_matrix': len(sound_ids),
            #'biggest_id': max([int(sid) for sid in sound_ids])
        }
        saveToJson(RECOMMENDATION_TMP_DATA_DIR + 'Current_index_stats.json',
                   stats)
        if self.verbose:
            print "done!"

        # Compute tag occurrences after loading the file
        tag_occurrences = dict()
        unique_ts = list(set(ts))
        for id, t in enumerate(unique_ts):
            tag_occurrences[t] = ts.count(t)

            if self.verbose:
                sys.stdout.write("\rComputing tag occurrences %.2f%%" %
                                 (float(100 * (id + 1)) / len(unique_ts)))
                sys.stdout.flush()
        print ""
        tags = []
        tags_ids = []
        for id, t in enumerate(unique_ts):

            if tag_occurrences[t] >= tag_threshold:
                tags.append(t)
                tags_ids.append(id)

            if self.verbose:
                sys.stdout.write("\rFiltering tags %.2f%%" %
                                 (float(100 * (id + 1)) / len(unique_ts)))
                sys.stdout.flush()

        nTags = len(tags)
        if self.verbose:
            print ""
            print "\tOriginal number of tags: " + str(len(unique_ts))
            print "\tTags after filtering: " + str(nTags)

        # Generate resource-tags dictionary only with filtered tags
        if self.verbose:
            print "Reading file for resources...",
        sys.stdout.flush()
        res_tags = {}
        res_user = {}
        res_tags_no_filt = {}
        idx = 0
        n_filtered_associations = 0
        for sid, stags in index.items():
            resource = sid
            user = None
            assigned_tags = stags
            assigned_tags_filt = list(
                set(assigned_tags).intersection(set(tags)))
            res_tags_no_filt[resource] = assigned_tags
            res_user[resource] = user
            if len(assigned_tags_filt) > 0:
                res_tags[resource] = assigned_tags_filt
                n_filtered_associations += len(assigned_tags_filt)

            idx += 1
            if idx > line_limit:
                break

        resources = res_tags.keys()
        nResources = len(resources)
        resources_ids = range(0, nResources)
        if self.verbose:
            print "done!"

        # Generate association matrix
        if self.verbose:
            print "\tOriginal number of associations: " + str(
                n_original_associations)
            print "\tAssociations after filtering: " + str(
                n_filtered_associations)

        if self.verbose:
            print 'Creating empty array of ' + str(nResources) + ' x ' + str(
                nTags) + '...',
        M = spmatrix.ll_mat(nResources, nTags)
        if self.verbose:
            print 'done!'

        done = 0
        for r_id in resources:
            for t in res_tags[r_id]:
                M[resources.index(r_id), tags.index(t)] = 1
                done += 1
                if self.verbose:
                    sys.stdout.write(
                        "\rGenerating association matrix %.2f%%" %
                        (float(100 * done) / n_filtered_associations))
                    sys.stdout.flush()
        if self.verbose:
            print ""

        # Save data
        if self.verbose:
            print "Saving association matrix, resource ids, tag ids and tag names"

        filename = "FS%.4i%.2i%.2i" % (datetime.today().year,
                                       datetime.today().month,
                                       datetime.today().day)
        M.export_mtx(RECOMMENDATION_TMP_DATA_DIR + filename +
                     '_ASSOCIATION_MATRIX.mtx')
        save(RECOMMENDATION_TMP_DATA_DIR + filename + '_RESOURCE_IDS.npy',
             resources)
        save(RECOMMENDATION_TMP_DATA_DIR + filename + '_TAG_IDS.npy', tags_ids)
        save(RECOMMENDATION_TMP_DATA_DIR + filename + '_TAG_NAMES.npy', tags)
        saveToJson(RECOMMENDATION_TMP_DATA_DIR + filename +
                   '_RESOURCES_TAGS.json',
                   res_tags,
                   verbose=self.verbose)
        #saveToJson(RECOMMENDATION_TMP_DATA_DIR + filename + '_RESOURCES_TAGS_NO_FILTER.json',res_tags_no_filt, verbose = self.verbose)
        #saveToJson(RECOMMENDATION_TMP_DATA_DIR + filename + '_RESOURCES_USER.json',res_user, verbose = self.verbose)

        return filename

    def association_matrix_to_similarity_matrix(self,
                                                metric="cosine",
                                                dataset="FREESOUND",
                                                save_sim=False,
                                                training_set=None,
                                                out_name_prefix="",
                                                is_general_recommender=False):

        if self.verbose:
            print "Loading association matrix and tag names, ids files..."
        try:
            M = spmatrix.ll_mat_from_mtx(RECOMMENDATION_TMP_DATA_DIR +
                                         dataset + "_ASSOCIATION_MATRIX.mtx")
            resource_ids = load(RECOMMENDATION_TMP_DATA_DIR + dataset +
                                "_RESOURCE_IDS.npy")
            tag_names = load(RECOMMENDATION_TMP_DATA_DIR + dataset +
                             "_TAG_NAMES.npy")
        except Exception:
            raise Exception(
                "Error loading association matrix and tag names, ids data")

        if metric not in ['cosine', 'binary', 'coocurrence', 'jaccard']:
            raise Exception("Wrong similarity metric specified")

        if self.verbose:
            print "Computing similarity matrix from a resource subset of the whole association matrix..."
        # Get index of resources to train (usable index for M)
        resource_id_positions = where(
            in1d(resource_ids, training_set, assume_unique=True))[0]

        # Matrix multiplication (only taking in account resources in training set and ALL tags)
        MM = spmatrix.dot(M[resource_id_positions, :],
                          M[resource_id_positions, :])

        # Get similarity matrix
        sim_matrix = spmatrix.ll_mat(MM.shape[0], MM.shape[0])
        non_zero_index = MM.keys()
        for index in non_zero_index:
            if metric == 'cosine':
                sim_matrix[index[0], index[1]] = MM[index[0], index[1]] * (
                    1 / (sqrt(MM[index[0], index[0]]) *
                         sqrt(MM[index[1], index[1]])))
            elif metric == 'coocurrence':
                sim_matrix[index[0], index[1]] = MM[index[0], index[1]]
            elif metric == 'binary':
                sim_matrix[index[0],
                           index[1]] = MM[index[0], index[1]] / MM[index[0],
                                                                   index[1]]
            elif metric == 'jaccard':
                sim_matrix[index[0], index[1]] = MM[index[0], index[1]] * (
                    1 / (MM[index[0], index[0]] + MM[index[1], index[1]] -
                         MM[index[0], index[1]]))

        # Clean out similarity matrix (clean tags that are not used)
        tag_positions = []
        for i in range(0, sim_matrix.shape[0]):
            if sim_matrix[i, i] != 0.0:
                tag_positions.append(i)

        # Transform sparse similarity matrix to npy format
        sim_matrix_npy = mtx2npy(sim_matrix[tag_positions, tag_positions])
        tag_names_sim_matrix = tag_names[tag_positions]

        if save_sim:
            if not is_general_recommender:
                # Save sim
                path = RECOMMENDATION_TMP_DATA_DIR + dataset + "_%s_SIMILARITY_MATRIX_" % out_name_prefix + metric + "_SUBSET.npy"
                if self.verbose:
                    print "Saving to " + path + "..."
                save(path, sim_matrix_npy)

                # Save tag names
                path = RECOMMENDATION_TMP_DATA_DIR + dataset + "_%s_SIMILARITY_MATRIX_" % out_name_prefix + metric + "_SUBSET_TAG_NAMES.npy"
                if self.verbose:
                    print "Saving to " + path + "..."
                save(path, tag_names_sim_matrix)
            else:
                # Save sim
                path = RECOMMENDATION_TMP_DATA_DIR + dataset + "_SIMILARITY_MATRIX_" + metric + ".npy"
                if self.verbose:
                    print "Saving to " + path + "..."
                save(path, sim_matrix_npy)

                # Save tag names
                path = RECOMMENDATION_TMP_DATA_DIR + dataset + "_SIMILARITY_MATRIX_" + metric + "_TAG_NAMES.npy"
                if self.verbose:
                    print "Saving to " + path + "..."
                save(path, tag_names_sim_matrix)

        return {
            'SIMILARITY_MATRIX': sim_matrix_npy,
            'TAG_NAMES': tag_names_sim_matrix
        }

    def process_tag_recommendation_data(self,
                                        resources_limit=None,
                                        tag_threshold=10,
                                        line_limit=99999999999999,
                                        recompute_all_classes=False,
                                        similarity_metric="cosine"):

        # Process the tas file and turn it into the association matrix and derived files
        database_name = self.tas_to_association_matrix(
            tag_threshold=tag_threshold, line_limit=line_limit)

        print "Loading community detector..."
        cd = CommunityDetector(verbose=False,
                               PATH=RECOMMENDATION_DATA_DIR + "Classifier")
        print cd

        # Classify existing resources
        resources_tags = loadFromJson(RECOMMENDATION_TMP_DATA_DIR +
                                      database_name + '_RESOURCES_TAGS.json')
        instances_ids = resources_tags.keys()
        try:
            resource_class = loadFromJson(
                RECOMMENDATION_DATA_DIR +
                'Classifier_classified_resources.json')
        except Exception:
            resource_class = dict()

        for count, id in enumerate(instances_ids):
            if not recompute_all_classes:
                if id not in resource_class:
                    resource_class[id] = cd.detectCommunity(
                        input_tags=resources_tags[id])
            else:
                resource_class[id] = cd.detectCommunity(
                    input_tags=resources_tags[id])

            if self.verbose:
                sys.stdout.write("\rClassifying resources... %.2f%%" %
                                 (float(100 *
                                        (count + 1)) / len(instances_ids)))
                sys.stdout.flush()

        print ""
        saveToJson(
            RECOMMENDATION_DATA_DIR + 'Classifier_classified_resources.json',
            resource_class)
        print ""

        print "\nComputing data for general recommender..."
        self.association_matrix_to_similarity_matrix(
            dataset=database_name,
            training_set=instances_ids[0:resources_limit],
            save_sim=True,
            is_general_recommender=True,
            metric=similarity_metric,
        )

        print "\nComputing data for class recommenders..."
        instance_id_class = []
        distinct_classes = []
        for count, instance_id in enumerate(instances_ids):
            class_id = resource_class[instance_id]
            instance_id_class.append([instance_id, class_id])

            if class_id not in distinct_classes:
                distinct_classes.append(class_id)

        print distinct_classes

        for collection_id in distinct_classes:
            print "\nComputing recommender for collection %s..." % collection_id

            # All resources from the training set classified as the selected category
            # (instead of all manually labeled)
            training_ids = []
            for instance in instance_id_class:
                if instance[1] == collection_id:
                    training_ids.append(instance[0])
            # Add limit
            training_ids = training_ids[0:resources_limit]

            if len(training_ids) < 1:
                raise Exception("Too less training ids for collection %s" %
                                collection_id)

            self.association_matrix_to_similarity_matrix(
                dataset=database_name,
                training_set=training_ids,
                save_sim=True,
                out_name_prefix=collection_id,
                is_general_recommender=False,
                metric=similarity_metric,
            )
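A hypothetical end-to-end run of this class, assuming the RECOMMENDATION_* directories exist and Index.json has already been populated (for instance via add_to_index above):

# Hypothetical driver code; parameter values mirror the defaults used above.
processor = RecommendationDataProcessor(verbose=True)
processor.process_tag_recommendation_data(
    tag_threshold=10,
    similarity_metric="cosine",
)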
Example 14
    def tas_to_association_matrix(self, tag_threshold=0, line_limit=1000000000):

        index = loadFromJson(RECOMMENDATION_DATA_DIR + "Index.json")
        # Get tags from file
        ts = []
        idx = 0
        n_original_associations = 0
        sound_ids = []
        if self.verbose:
            print "Reading index file (%i entries)..." % len(index.items()),
        for sid, tags in index.items():
            ts += tags
            n_original_associations += len(tags)
            sound_ids.append(sid)

            idx += 1
            if idx > line_limit:
                break

        stats = {
            'n_sounds_in_matrix': len(sound_ids),
            #'biggest_id': max([int(sid) for sid in sound_ids])
        }
        saveToJson(RECOMMENDATION_TMP_DATA_DIR + 'Current_index_stats.json', stats)
        if self.verbose:
            print "done!"

        # Compute tag occurrences after loading the file
        tag_occurrences = dict()
        unique_ts = list(set(ts))
        for id, t in enumerate(unique_ts):
            tag_occurrences[t] = ts.count(t)

            if self.verbose:
                sys.stdout.write("\rComputing tag occurrences %.2f%%"%(float(100*(id+1))/len(unique_ts)))
                sys.stdout.flush()
        print ""
        tags = []
        tags_ids = []
        for id, t in enumerate(unique_ts):

            if tag_occurrences[t] >= tag_threshold:
                tags.append(t)
                tags_ids.append(id)

            if self.verbose:
                sys.stdout.write("\rFiltering tags %.2f%%"%(float(100*(id+1))/len(unique_ts)))
                sys.stdout.flush()

        nTags = len(tags)
        if self.verbose:
            print ""
            print "\tOriginal number of tags: " + str(len(unique_ts))
            print "\tTags after filtering: " + str(nTags)

        # Generate resource-tags dictionary only with filtered tags
        if self.verbose:
            print "Reading file for resources...",
        sys.stdout.flush()
        res_tags = {}
        res_user = {}
        res_tags_no_filt = {}
        idx = 0
        n_filtered_associations = 0
        for sid, stags in index.items():
            resource = sid
            user = None
            assigned_tags = stags
            assigned_tags_filt = list(set(assigned_tags).intersection(set(tags)))
            res_tags_no_filt[resource] = assigned_tags
            res_user[resource] = user
            if len(assigned_tags_filt) > 0:
                res_tags[resource] = assigned_tags_filt
                n_filtered_associations += len(assigned_tags_filt)

            idx += 1
            if idx > line_limit:
                break

        resources = res_tags.keys()
        nResources = len(resources)
        resources_ids = range(0,nResources)
        if self.verbose:
            print "done!"

        # Generate association matrix
        if self.verbose:
            print "\tOriginal number of associations: " + str(n_original_associations)
            print "\tAssociations after filtering: " + str(n_filtered_associations)

        if self.verbose:
            print 'Creating empty array of ' + str(nResources) + ' x ' + str(nTags) + '...',
        M = spmatrix.ll_mat(nResources, nTags)
        if self.verbose:
            print 'done!'

        done = 0
        for r_id in resources:
            for t in res_tags[r_id]:
                M[resources.index(r_id),tags.index(t)] = 1
                done += 1
                if self.verbose:
                    sys.stdout.write("\rGenerating association matrix %.2f%%" % (float(100*done)/n_filtered_associations))
                    sys.stdout.flush()
        if self.verbose:
            print ""

        # Save data
        if self.verbose:
            print "Saving association matrix, resource ids, tag ids and tag names"

        filename = "FS%.4i%.2i%.2i" % (datetime.today().year, datetime.today().month, datetime.today().day)
        M.export_mtx(RECOMMENDATION_TMP_DATA_DIR + filename + '_ASSOCIATION_MATRIX.mtx')
        save(RECOMMENDATION_TMP_DATA_DIR + filename + '_RESOURCE_IDS.npy',resources)
        save(RECOMMENDATION_TMP_DATA_DIR + filename + '_TAG_IDS.npy',tags_ids)
        save(RECOMMENDATION_TMP_DATA_DIR + filename + '_TAG_NAMES.npy',tags)
        saveToJson(RECOMMENDATION_TMP_DATA_DIR + filename + '_RESOURCES_TAGS.json',res_tags, verbose = self.verbose)
        #saveToJson(RECOMMENDATION_TMP_DATA_DIR + filename + '_RESOURCES_TAGS_NO_FILTER.json',res_tags_no_filt, verbose = self.verbose)
        #saveToJson(RECOMMENDATION_TMP_DATA_DIR + filename + '_RESOURCES_USER.json',res_user, verbose = self.verbose)

        return filename
Example 15
    def process_tag_recommendation_data(self,
                                        resources_limit=None,
                                        tag_threshold=10,
                                        line_limit=99999999999999,
                                        recompute_all_classes=False,
                                        similarity_metric="cosine"):

        # Process the tas file and turn it into the association matrix and derived files
        database_name = self.tas_to_association_matrix(tag_threshold=tag_threshold, line_limit=line_limit)

        print "Loading community detector..."
        cd = CommunityDetector(verbose=False, PATH=RECOMMENDATION_DATA_DIR + "Classifier")
        print cd

        # Classify existing resources
        resources_tags = loadFromJson(RECOMMENDATION_TMP_DATA_DIR + database_name + '_RESOURCES_TAGS.json')
        instances_ids = resources_tags.keys()
        try:
            resource_class = loadFromJson(RECOMMENDATION_DATA_DIR + 'Classifier_classified_resources.json')
        except Exception as e:
            resource_class = dict()

        for count, id in enumerate(instances_ids):
            if not recompute_all_classes:
                if id not in resource_class:
                    resource_class[id] = cd.detectCommunity(input_tags=resources_tags[id])
            else:
                resource_class[id] = cd.detectCommunity(input_tags=resources_tags[id])

            if self.verbose:
                sys.stdout.write("\rClassifying resources... %.2f%%"%(float(100*(count+1))/len(instances_ids)))
                sys.stdout.flush()

        print ""
        saveToJson(RECOMMENDATION_DATA_DIR + 'Classifier_classified_resources.json', resource_class)
        print ""

        print "\nComputing data for general recommender..."
        self.association_matrix_to_similarity_matrix(
            dataset=database_name,
            training_set=instances_ids[0:resources_limit],
            save_sim=True,
            is_general_recommender=True,
            metric=similarity_metric,
        )

        print "\nComputing data for class recommenders..."
        instance_id_class = []
        distinct_classes = []
        for count, instance_id in enumerate(instances_ids):
            class_id = resource_class[instance_id]
            instance_id_class.append([instance_id, class_id])

            if class_id not in distinct_classes:
                distinct_classes.append(class_id)

        print distinct_classes

        for collection_id in distinct_classes:
            print "\nComputing recommender for collection %s..." % collection_id

            # All resources from the training set classified as the selected category
            # (instead of all manually labeled)
            training_ids = []
            for instance in instance_id_class:
                if instance[1] == collection_id:
                    training_ids.append(instance[0])
            # Add limit
            training_ids = training_ids[0:resources_limit]

            if len(training_ids) < 1:
                raise Exception("Too less training ids for collection %s" % collection_id)

            self.association_matrix_to_similarity_matrix(
                dataset=database_name,
                training_set=training_ids,
                save_sim=True,
                out_name_prefix=collection_id,
                is_general_recommender=False,
                metric=similarity_metric,
            )