def modeEvalFunction(config, setting):
    pdb_bound = config.getInputFile(setting, 'protein_bound')
    pdb_unbound = config.getInputFile(setting, 'protein_unbound')
    mode_file = config.getInputFile(setting, 'mode_file')
    output = config.getOutputFile(setting, 'out')

    bound_list = utils.readFileToList(pdb_bound)
    unbound_list = utils.readFileToList(pdb_unbound)
    unbound_residues = utils.getResidueFromPDBlines(unbound_list)

    bound_CA = utils.getCAOnlyFromPDBLines(bound_list)
    unbound_CA = utils.getCAOnlyFromPDBLines(unbound_list)
    bound_CA_pos = utils.getCoordinatesFromPDBlines(bound_CA)
    unbound_CA_pos = utils.getCoordinatesFromPDBlines(unbound_CA)

    modes = utils.read_modes(mode_file)
    cumulative_overlap = 0
    eval_dict = {}
    for modeIdx, mode in modes.items():
        ca_modes = utils.getCAModes(unbound_residues, mode['evec'])
        overlap = utils.getOverlap(unbound_CA_pos, bound_CA_pos, ca_modes)
        cumulative_overlap += overlap**2
        contributionCA = utils.getModeContribution(bound_CA_pos - unbound_CA_pos, ca_modes)
        norm = utils.getModeNorm(mode['evec'])
        contribution = contributionCA * norm
        magnitude = utils.getModeMagnitude(ca_modes)
        maximaIndices = utils.getIndexMaxima(magnitude)
        maxima = magnitude[maximaIndices]
        eval_dict[modeIdx] = {
            'overlap': overlap,
            'cum_overlap': np.sqrt(cumulative_overlap),
            'eigenvalue': mode['eval'],
            'norm': norm,
            'contribution': contribution,
            'contribution_ca': contributionCA,
            'maxima_indices': maximaIndices.tolist(),
            'maxima_values': maxima.tolist()
        }
    utils.saveToJson(output, {
        'bound': pdb_bound,
        'unbound': pdb_unbound,
        'mode_file': mode_file,
        'modes': eval_dict
    })
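# Illustrative sketch (not part of the pipeline): modeEvalFunction accumulates
# squared per-mode overlaps and reports their square root as 'cum_overlap'.
# Assuming the usual normal-mode definitions, the overlap of mode vector v with
# the unbound-to-bound difference d is |d.v| / (|d||v|), and the cumulative
# overlap over the first K modes is sqrt(sum of the squared overlaps). The
# helpers below are hypothetical stand-ins for utils.getOverlap, not its code.
import numpy as np

def overlap_sketch(unbound_pos, bound_pos, mode_vec):
    d = (np.asarray(bound_pos) - np.asarray(unbound_pos)).ravel()
    v = np.asarray(mode_vec).ravel()
    return abs(np.dot(d, v)) / (np.linalg.norm(d) * np.linalg.norm(v))

def cumulative_overlap_sketch(unbound_pos, bound_pos, mode_vecs):
    return np.sqrt(sum(overlap_sketch(unbound_pos, bound_pos, v) ** 2
                       for v in mode_vecs))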
def add_to_index(self, sound_ids, sound_tagss):
    sound_ids = sound_ids[0].split(",")
    sound_tags = [stags.split(",") for stags in sound_tagss[0].split("-!-!-")]
    logger.info('Adding %i sounds to recommendation index' % len(sound_ids))
    for count, sound_id in enumerate(sound_ids):
        sid = sound_id
        stags = sound_tags[count]
        self.index[sid] = stags
        if len(self.index.keys()) % 1000 == 0:
            # Every 1000 indexed sounds, save the index
            logger.info('Saving tagrecommendation index...')
            saveToJson(RECOMMENDATION_DATA_DIR + 'Index.json', self.index, verbose=False)
    self.index_stats['biggest_id_in_index'] = max([int(key) for key in self.index.keys()])
    self.index_stats['n_sounds_in_index'] = len(self.index.keys())
    result = {'error': False, 'result': True}
    return json.dumps(result)
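# Illustrative usage of add_to_index's wire format: sound ids arrive as one
# comma-separated string, and tag lists are comma-separated within a sound and
# joined by the "-!-!-" sentinel between sounds, both wrapped in one-element
# lists (presumably by the RPC layer). The values here are made up.
sound_ids_arg = ["123,124"]
sound_tagss_arg = ["piano,loop-!-!-drum,kick,loop"]
ids = sound_ids_arg[0].split(",")
tags = [s.split(",") for s in sound_tagss_arg[0].split("-!-!-")]
assert dict(zip(ids, tags)) == {'123': ['piano', 'loop'],
                                '124': ['drum', 'kick', 'loop']}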
def clear_temp_files(self):
    new_data = False
    for filename in os.listdir(RECOMMENDATION_TMP_DATA_DIR):
        if "SIMILARITY_MATRIX" in filename and "SUBSET" in filename:
            new_data = True
            break
    if not new_data:
        raise Exception("There is no new matrix data to update the tag recommendation system")

    for filename in os.listdir(RECOMMENDATION_DATA_DIR):
        file_extension = filename.split(".")[-1]
        if file_extension in ['npy', 'json', 'pkl']:
            if "Classifier" not in filename and "Index" not in filename:
                # Do not alter Classifier or Index files
                if filename[0:6] == "backup":
                    # Delete old backups
                    print "Removing %s" % (RECOMMENDATION_DATA_DIR + filename)
                    os.remove(RECOMMENDATION_DATA_DIR + filename)
                else:
                    # Set previous matrices to "backup mode" (they will be deleted in the next update)
                    print "Setting to backup %s" % (RECOMMENDATION_DATA_DIR + filename)
                    os.rename(RECOMMENDATION_DATA_DIR + filename,
                              RECOMMENDATION_DATA_DIR + "backup_" + filename)

    current_database_name = ""
    class_names = []
    for filename in os.listdir(RECOMMENDATION_TMP_DATA_DIR):
        file_extension = filename.split(".")[-1]
        if "Index" not in filename:
            if ("SIMILARITY_MATRIX" in filename and "SUBSET" in filename) or "stats" in filename:
                # Move similarity matrix to recommendation data dir
                print "Moving %s" % (RECOMMENDATION_TMP_DATA_DIR + filename)
                os.rename(RECOMMENDATION_TMP_DATA_DIR + filename,
                          RECOMMENDATION_DATA_DIR + filename)
                if "stats" not in filename:
                    current_database_name = filename.split("_")[0]
                    class_names.append(filename.split("_")[1])
            else:
                # Remove remaining files in tmp dir (except for the tas file)
                print "Clearing %s" % (RECOMMENDATION_TMP_DATA_DIR + filename)
                os.remove(RECOMMENDATION_TMP_DATA_DIR + filename)

    class_names = list(set(class_names))
    saveToJson(RECOMMENDATION_DATA_DIR + 'Current_database_and_class_names.json',
               {'database': current_database_name, 'classes': class_names})
    # Reload tag recommendation server
    urllib.urlopen('http://%s:%i/tagrecommendation/reload' % (TAGRECOMMENDATION_ADDRESS, TAGRECOMMENDATION_PORT))
def FindTermini(config, setting):
    cutSetting = config.getSetting(setting)
    inputPdb = config.getInputFile(setting, "pdb")
    looseTerminiLog = config.getOutputFile(setting, "out")
    cutoff = cutSetting['cutoff']
    if config.getSetting(setting)['verbose']:
        print("Find Termini from " + inputPdb)
    if not config.getSetting(setting)["dryRun"]:
        # log = utils.findAndCutLooseTermini(inputPdb, cutPdb, cutoff)
        log = utils.FindLooseTermini(inputPdb, cutoff=cutoff)
        log['cutoff'] = cutoff
        utils.saveToJson(looseTerminiLog, log)
def evaluateModeDOFS(config, setting):
    input_dof_file = config.getInputFile(setting, "input_dof")
    mode_evaluation_rec = config.getInputFile(setting, "mode_evaluation_rec")
    mode_evaluation_lig = config.getInputFile(setting, "mode_evaluation_lig")
    output = config.getOutputFile(setting, "out")

    dof_eval_settings = config.getSetting(setting)
    num_eval = dof_eval_settings['num_eval']
    numModesRec = dof_eval_settings['numModesRec']
    numModesLig = dof_eval_settings['numModesLig']
    if config.getSetting(setting)['verbose']:
        print("SETTING: ", setting.upper(), " evaluating dofs for", input_dof_file, ' and output to ', output)
    if not config.getSetting(setting)["dryRun"]:
        dof_dict = utils.read_Dof(input_dof_file)
        sorted_keys = np.sort(np.asarray(list(dof_dict.keys()), dtype=int))

        # Ideal per-mode contributions from the receptor and ligand mode evaluations
        contributions_rec = {}
        for key, val in json.load(open(mode_evaluation_rec, 'r'))['modes'].items():
            contributions_rec[int(key)] = val['contribution']
        contributions_lig = {}
        for key, val in json.load(open(mode_evaluation_lig, 'r'))['modes'].items():
            contributions_lig[int(key)] = val['contribution']

        result = {}
        for key in sorted_keys[:num_eval]:
            dof = dof_dict[key]
            # Skip the first six entries (presumably the rigid-body degrees of freedom)
            modes_rec = dof['rec'][6:]
            modes_lig = dof['lig'][6:]
            rec = {}
            lig = {}
            for i, mode in enumerate(modes_rec):
                rec[str(i + 1)] = {
                    'ratio': np.float64(mode) / contributions_rec[i + 1] - 1,
                    'dof': mode,
                    'mode': contributions_rec[i + 1]
                }
            for i, mode in enumerate(modes_lig):
                lig[str(i + 1)] = {
                    'ratio': np.float64(mode) / contributions_lig[i + 1] - 1,
                    'dof': mode,
                    'mode': contributions_lig[i + 1]
                }
            result[str(key)] = {'rec': rec, 'lig': lig}
        utils.saveToJson(filename=output, data=result)
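# Worked example of the 'ratio' field computed above: it is the relative
# deviation of a sampled mode amplitude (from the DOF file) from the ideal
# contribution derived in the mode evaluation, so 0 means the docking DOF
# matches the ideal amplitude exactly and -1 means the mode is unused.
def dof_ratio(dof_amplitude, ideal_contribution):
    return float(dof_amplitude) / ideal_contribution - 1

assert dof_ratio(2.0, 2.0) == 0.0
assert dof_ratio(0.0, 2.0) == -1.0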
def GetInterface(config, setting):
    # receptor = config.getInputFile(setting, 'receptor')
    # ligand = config.getInputFile(setting, 'ligand')
    pdb = config.getInputFile(setting, 'pdb')
    interfaceFile = config.getOutputFile(setting, 'out')
    cutoff = config.getSetting(setting)['cutoff']
    if config.getSetting(setting)['verbose']:
        print("Get interface from pdb " + pdb)
    if not config.getSetting(setting)["dryRun"]:
        structures = utils.parseBIOPdbToStructure(pdb)
        interfaces = []
        for struct in structures:
            receptor = struct['A']
            ligand = struct['B']
            contactResiduesRec, contactResiduesLig = utils.getInterfaceResidues(receptor, ligand, cutoff)
            recinterfaceResidues = utils.getResidueIds(contactResiduesRec)
            liginterfaceResidues = utils.getResidueIds(contactResiduesLig)
            interfaces.append({
                'model': struct.id,
                'recInterfaceResidues': recinterfaceResidues,
                'ligInterfaceResidues': liginterfaceResidues
            })
        utils.saveToJson(interfaceFile, {'file': pdb, 'cutoff': cutoff, 'interfaces': interfaces})
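# Illustrative sketch of what utils.getInterfaceResidues presumably does:
# collect the residues of each chain that have any atom within `cutoff`
# Angstroms of the other chain. This reimplementation uses Biopython's
# NeighborSearch and is an assumption about the helper's behavior, not its
# actual source.
from Bio.PDB import NeighborSearch

def interface_residues_sketch(receptor_chain, ligand_chain, cutoff):
    ligand_atoms = [atom for residue in ligand_chain for atom in residue]
    ns = NeighborSearch(ligand_atoms)
    rec_contacts, lig_contacts = set(), set()
    for residue in receptor_chain:
        for atom in residue:
            for lig_atom in ns.search(atom.coord, cutoff):
                rec_contacts.add(residue)
                lig_contacts.add(lig_atom.get_parent())
    return list(rec_contacts), list(lig_contacts)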
def evalProtein(config, setting):
    secondary_file = config.getInputFile(setting, 'secondary')
    output = config.getOutputFile(setting, 'out')
    if config.getSetting(setting)['verbose']:
        print("SETTING: ", setting.upper(), " evaluating protein for", secondary_file)
    if not config.getSetting(setting)["dryRun"]:
        secLines = utils.getSecLines(utils.readFileToList(secondary_file))
        amino_acid_names = ['LYS', 'PRO', 'ILE', 'TRP', 'GLU', 'GLN', 'GLY', 'SER', 'PHE', 'HIS',
                            'TYR', 'LEU', 'ASP', 'ASN', 'ARG', 'THR', 'ALA', 'CYS', 'VAL', 'MET']
        sec_classes = ['C', 'E', 'B', 'T', 'H', 'G', 'b']
        area = 0
        secondary, aminoAcids = [], []
        # Accumulate area per amino-acid type and per secondary-structure class
        aa_area = dict.fromkeys(amino_acid_names, 0)
        sec_area = dict.fromkeys(sec_classes, 0)
        for line in secLines:
            a = float(line[9])
            area += a
            secondary.append(line[5])
            aminoAcids.append(line[1])
            aa_area[line[1]] += a
            sec_area[line[5]] += a
        area = np.asarray(area).sum()

        # Fraction of residues per amino-acid type and secondary-structure class
        aa = dict.fromkeys(amino_acid_names, 0)
        sec = dict.fromkeys(sec_classes, 0)
        size = float(len(secondary))
        for key, val in Counter(secondary).items():
            sec[key] = val / size
        for key, val in Counter(aminoAcids).items():
            aa[key] = val / size
        # Fraction of total area per amino-acid type and secondary-structure class
        for key in aa_area.keys():
            aa_area[key] /= area
        for key in sec_area.keys():
            sec_area[key] /= area
        utils.saveToJson(output, {
            'secondary': sec,
            'aminoAcids': aa,
            'area': area,
            'size': size,
            'sec_area': sec_area,
            'aa_area': aa_area
        })
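# Mini-example of the composition bookkeeping in evalProtein: Counter
# frequencies divided by the sequence length give per-class fractions that sum
# to 1, with classes absent from the sequence left at 0.
from collections import Counter

sec_string = list("CHHHECC")
fractions = dict.fromkeys(['C', 'E', 'B', 'T', 'H', 'G', 'b'], 0)
size = float(len(sec_string))
for key, val in Counter(sec_string).items():
    fractions[key] = val / size
# fractions['C'] == fractions['H'] == 3/7, fractions['E'] == 1/7, rest stay 0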
def GetInterface(config, setting):
    pdb = config.getInputFile(setting, 'pdb')
    interfaceFile = config.getOutputFile(setting, 'out')
    receptor_filename = config.getInputFile(setting, 'receptor')
    ligand_filename = config.getInputFile(setting, 'ligand')
    receptorSec_filename = config.getInputFile(setting, 'receptorSec')
    ligandSec_filename = config.getInputFile(setting, 'ligandSec')
    cutoff = config.getSetting(setting)['cutoff']
    if config.getSetting(setting)['verbose']:
        print("SETTING: ", setting.upper(), " Get interface from pdb " + pdb)
    if not config.getSetting(setting)["dryRun"]:
        try:
            receptorSec = utils.getSecLines(utils.readFileToList(receptorSec_filename))
            ligandSec = utils.getSecLines(utils.readFileToList(ligandSec_filename))
            # Map residue ids to indices into the secondary-structure line lists
            recmap = utils.getUniqueResIds(utils.getResidueFromPDBlines(utils.readFileToList(receptor_filename)))
            ligmap = utils.getUniqueResIds(utils.getResidueFromPDBlines(utils.readFileToList(ligand_filename)))
            structures = utils.parseBIOPdbToStructure(pdb)
            interfaces = []
            amino_acid_names = ['LYS', 'PRO', 'ILE', 'TRP', 'GLU', 'GLN', 'GLY', 'SER', 'PHE', 'HIS',
                                'TYR', 'LEU', 'ASP', 'ASN', 'ARG', 'THR', 'ALA', 'CYS', 'VAL', 'MET']
            sec_classes = ['C', 'E', 'B', 'T', 'H', 'G', 'b']
            for struct in structures:
                receptor = struct['A']
                ligand = struct['B']
                contactResiduesRec, contactResiduesLig = utils.getInterfaceResidues(receptor, ligand, cutoff)
                if len(contactResiduesRec) == 0 and len(contactResiduesLig) == 0:
                    continue
                recinterfaceResidues = utils.getResidueIds(contactResiduesRec)
                liginterfaceResidues = utils.getResidueIds(contactResiduesLig)
                interfacePosRec = utils.getResidueCoordinates(contactResiduesRec).T
                interfacePosLig = utils.getResidueCoordinates(contactResiduesLig).T
                interfaceRecIndices = [recmap[key] for key in recinterfaceResidues]
                interfaceLigIndices = [ligmap[key] for key in liginterfaceResidues]

                # Collect secondary structure, residue name and area per interface residue
                isecRec, AARec = [], []
                areaRec = 0
                for i in interfaceRecIndices:
                    line = receptorSec[i]
                    isecRec.append(line[5])
                    AARec.append(line[1])
                    areaRec += float(line[9])
                isecLig, AALig = [], []
                areaLig = 0
                for i in interfaceLigIndices:
                    line = ligandSec[i]
                    isecLig.append(line[5])
                    AALig.append(line[1])
                    areaLig += float(line[9])

                # Relative amino-acid composition of each interface
                AARecCount = dict.fromkeys(amino_acid_names, 0)
                aalen = float(len(AARec))
                for key, value in Counter(AARec).items():
                    AARecCount[key] = value / aalen
                AALigCount = dict.fromkeys(amino_acid_names, 0)
                aalen = float(len(AALig))
                for key, value in Counter(AALig).items():
                    AALigCount[key] = value / aalen
                # Relative secondary-structure composition of each interface
                countSecRec = dict.fromkeys(sec_classes, 0)
                lenSec = float(len(isecRec))
                for key, value in Counter(isecRec).items():
                    countSecRec[key] = value / lenSec
                countSecLig = dict.fromkeys(sec_classes, 0)
                lenSec = float(len(isecLig))
                for key, value in Counter(isecLig).items():
                    countSecLig[key] = value / lenSec

                interfaces.append({
                    'model': struct.id,
                    'recInterfaceResidues': recinterfaceResidues,
                    'ligInterfaceResidues': liginterfaceResidues,
                    'recAA': AARec,
                    'ligAA': AALig,
                    'rec_x': list(interfacePosRec[0]),
                    'rec_y': list(interfacePosRec[1]),
                    'rec_z': list(interfacePosRec[2]),
                    'lig_x': list(interfacePosLig[0]),
                    'lig_y': list(interfacePosLig[1]),
                    'lig_z': list(interfacePosLig[2]),
                    'rec_sec': isecRec,
                    'lig_sec': isecLig,
                    'countSecRec': countSecRec,
                    'countSecLig': countSecLig,
                    'AALigCount': AALigCount,
                    'AARecCount': AARecCount,
                    'areaRec': areaRec,
                    'areaLig': areaLig
                })
            utils.saveToJson(interfaceFile, {'file': pdb, 'cutoff': cutoff, 'interfaces': interfaces})
        except Exception as e:
            print("eval interface: FAILED", interfaceFile, e)
def modeEvalFunction(config, setting):
    pdb_bound = config.getInputFile(setting, 'protein_bound')
    pdb_unbound = config.getInputFile(setting, 'protein_unbound')
    mode_file = config.getInputFile(setting, 'mode_file')
    secondary_file = config.getInputFile(setting, 'secondary')
    output = config.getOutputFile(setting, 'out')
    if config.getSetting(setting)['verbose']:
        print("SETTING: ", setting.upper(), " evaluating modes for", mode_file, " and output to ", output)
    if not config.getSetting(setting)["dryRun"]:
        try:
            bound_list = utils.readFileToList(pdb_bound)
            unbound_list = utils.readFileToList(pdb_unbound)
            secondary = [line[5] for line in utils.getSecLines(utils.readFileToList(secondary_file))]

            # Index of the first atom line of each residue in the unbound structure
            currid = None
            indices = []
            count = 0
            for rid in utils.getResidueFromPDBlines(unbound_list):
                if rid != currid:
                    indices.append(count)
                    currid = rid
                count += 1

            bound_CA = utils.getCAOnlyFromPDBLines(bound_list)
            unbound_CA = utils.getCAOnlyFromPDBLines(unbound_list)
            unbound_residues = utils.getResidueNamesFromPDBlines(unbound_CA)
            bound_CA_pos = utils.getCoordinatesFromPDBlines(bound_CA)
            unbound_CA_pos = utils.getCoordinatesFromPDBlines(unbound_CA)
            modes = utils.read_modes(mode_file)

            amino_acid_names = ['LYS', 'PRO', 'ILE', 'TRP', 'GLU', 'GLN', 'GLY', 'SER', 'PHE', 'HIS',
                                'TYR', 'LEU', 'ASP', 'ASN', 'ARG', 'THR', 'ALA', 'CYS', 'VAL', 'MET']
            sec_classes = ['C', 'E', 'B', 'T', 'H', 'G', 'b']
            cumulative_overlap = 0
            eval_dict = {}
            for modeIdx, mode in modes.items():
                # Restrict the eigenvector to one entry per residue (the CA positions)
                ca_modes = [mode['evec'][idx] for idx in indices]

                # Distribute the squared mode amplitude over amino-acid types
                # and secondary-structure classes, normalized by the total
                area_aa = dict.fromkeys(amino_acid_names, 0)
                area_sec = dict.fromkeys(sec_classes, 0)
                integral = 0
                for i, vec in enumerate(ca_modes):
                    ampl = vec[0]**2 + vec[1]**2 + vec[2]**2
                    integral += ampl
                    area_aa[unbound_residues[i]] += ampl
                    area_sec[secondary[i]] += ampl
                for key in area_aa.keys():
                    area_aa[key] /= integral
                for key in area_sec.keys():
                    area_sec[key] /= integral

                overlap = utils.getOverlap(unbound_CA_pos, bound_CA_pos, ca_modes)
                cumulative_overlap += overlap**2
                contributionCA = utils.getModeContribution(bound_CA_pos - unbound_CA_pos, ca_modes).tolist()
                norm = utils.getModeNorm(mode['evec'])
                contribution = contributionCA * norm
                magnitude = utils.getModeMagnitude(ca_modes)
                maximaIndices = utils.getIndexMaxima(magnitude)
                maxima = magnitude[maximaIndices]
                eval_dict[modeIdx] = {
                    'overlap': overlap,
                    'cum_overlap': np.sqrt(cumulative_overlap),
                    'eigenvalue': mode['eval'],
                    'norm': norm,
                    'contribution': contribution,
                    'contribution_ca': contributionCA,
                    'maxima_indices': maximaIndices.tolist(),
                    'maxima_values': maxima.tolist(),
                    'area_aa': area_aa,
                    'area_sec': area_sec
                }
            utils.saveToJson(output, {
                'bound': pdb_bound,
                'unbound': pdb_unbound,
                'mode_file': mode_file,
                'modes': eval_dict
            })
        except Exception as e:
            print("failed to evaluate protein", pdb_unbound, e)
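# Illustrative sketch of the per-mode bookkeeping above: each CA eigenvector
# component contributes its squared length, and per-label totals are divided by
# the overall sum, so 'area_aa' and 'area_sec' are fractions of the mode's
# total squared amplitude. The function below is a generic stand-in, not
# pipeline code.
import numpy as np

def mode_amplitude_fractions(ca_modes, labels):
    ampl = np.sum(np.square(np.asarray(ca_modes, dtype=float)), axis=1)
    total = ampl.sum()
    fractions = {}
    for a, label in zip(ampl, labels):
        fractions[label] = fractions.get(label, 0.0) + a / total
    return fractions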
class RecommendationDataProcessor:
    '''
    This class has methods to generate all the files that the tag
    recommendation system needs to recommend tags. To generate these files
    the data processor needs the Index.json file with the tag association
    information from Freesound. The Index.json file must have the following
    form:
        {
            "1142": ["glitch", "loop", "plucked", "string"],
            "1143": ["glitch", "loop", "plucked", "string"],
            ...
        }
    The files that are generated by the system are (for every sound class:
    Soundscape, Music, Fx, Samples, Speech):
        [[DATABASE]]_[[CLASSNAME]]_SIMILARITY_MATRIX_cosine_SUBSET.npy
        [[DATABASE]]_[[CLASSNAME]]_SIMILARITY_MATRIX_cosine_SUBSET_TAG_NAMES.npy
    '''
    verbose = None

    def __init__(self, verbose=True):
        self.verbose = verbose

    def __repr__(self):
        return "RecommendationDataProcessor instance"

    def tas_to_association_matrix(self, tag_threshold=0, line_limit=1000000000):
        index = loadFromJson(RECOMMENDATION_DATA_DIR + "Index.json")

        # Get tags from file
        ts = []
        idx = 0
        n_original_associations = 0
        sound_ids = []
        if self.verbose:
            print "Reading index file (%i entries)..." % len(index.items()),
        for sid, tags in index.items():
            ts += tags
            n_original_associations += len(tags)
            sound_ids.append(sid)
            idx += 1
            if idx > line_limit:
                break
        stats = {
            'n_sounds_in_matrix': len(sound_ids),
            #'biggest_id': max([int(sid) for sid in sound_ids])
        }
        saveToJson(RECOMMENDATION_TMP_DATA_DIR + 'Current_index_stats.json', stats)
        if self.verbose:
            print "done!"

        # Compute tag occurrences after loading the file
        tag_occurrences = dict()
        unique_ts = list(set(ts))
        for id, t in enumerate(unique_ts):
            tag_occurrences[t] = ts.count(t)
            if self.verbose:
                sys.stdout.write("\rComputing tag occurrences %.2f%%" % (float(100 * (id + 1)) / len(unique_ts)))
                sys.stdout.flush()
        print ""

        # Keep only tags used at least tag_threshold times
        tags = []
        tags_ids = []
        for id, t in enumerate(unique_ts):
            if tag_occurrences[t] >= tag_threshold:
                tags.append(t)
                tags_ids.append(id)
            if self.verbose:
                sys.stdout.write("\rFiltering tags %.2f%%" % (float(100 * (id + 1)) / len(unique_ts)))
                sys.stdout.flush()
        nTags = len(tags)
        if self.verbose:
            print ""
            print "\tOriginal number of tags: " + str(len(unique_ts))
            print "\tTags after filtering: " + str(nTags)

        # Generate resource-tags dictionary only with filtered tags
        if self.verbose:
            print "Reading file for resources...",
            sys.stdout.flush()
        res_tags = {}
        res_user = {}
        res_tags_no_filt = {}
        idx = 0
        n_filtered_associations = 0
        for sid, stags in index.items():
            resource = sid
            user = None
            assigned_tags = stags
            assigned_tags_filt = list(set(assigned_tags).intersection(set(tags)))
            res_tags_no_filt[resource] = assigned_tags
            res_user[resource] = user
            if len(assigned_tags_filt) > 0:
                res_tags[resource] = assigned_tags_filt
                n_filtered_associations += len(assigned_tags_filt)
            idx += 1
            if idx > line_limit:
                break
        resources = res_tags.keys()
        nResources = len(resources)
        resources_ids = range(0, nResources)
        if self.verbose:
            print "done!"

        # Generate association matrix
        if self.verbose:
            print "\tOriginal number of associations: " + str(n_original_associations)
            print "\tAssociations after filtering: " + str(n_filtered_associations)
        if self.verbose:
            print 'Creating empty array of ' + str(nResources) + ' x ' + str(nTags) + '...',
        M = spmatrix.ll_mat(nResources, nTags)
        if self.verbose:
            print 'done!'
        done = 0
        for r_id in resources:
            for t in res_tags[r_id]:
                M[resources.index(r_id), tags.index(t)] = 1
                done += 1
                if self.verbose:
                    sys.stdout.write("\rGenerating association matrix %.2f%%" % (float(100 * done) / n_filtered_associations))
                    sys.stdout.flush()
        if self.verbose:
            print ""

        # Save data
        if self.verbose:
            print "Saving association matrix, resource ids, tag ids and tag names"
        filename = "FS%.4i%.2i%.2i" % (datetime.today().year, datetime.today().month, datetime.today().day)
        M.export_mtx(RECOMMENDATION_TMP_DATA_DIR + filename + '_ASSOCIATION_MATRIX.mtx')
        save(RECOMMENDATION_TMP_DATA_DIR + filename + '_RESOURCE_IDS.npy', resources)
        save(RECOMMENDATION_TMP_DATA_DIR + filename + '_TAG_IDS.npy', tags_ids)
        save(RECOMMENDATION_TMP_DATA_DIR + filename + '_TAG_NAMES.npy', tags)
        saveToJson(RECOMMENDATION_TMP_DATA_DIR + filename + '_RESOURCES_TAGS.json', res_tags, verbose=self.verbose)
        #saveToJson(RECOMMENDATION_TMP_DATA_DIR + filename + '_RESOURCES_TAGS_NO_FILTER.json', res_tags_no_filt, verbose=self.verbose)
        #saveToJson(RECOMMENDATION_TMP_DATA_DIR + filename + '_RESOURCES_USER.json', res_user, verbose=self.verbose)
        return filename

    def association_matrix_to_similarity_matrix(self, metric="cosine", dataset="FREESOUND", save_sim=False,
                                                training_set=None, out_name_prefix="", is_general_recommender=False):
        if self.verbose:
            print "Loading association matrix and tag names, ids files..."
        try:
            M = spmatrix.ll_mat_from_mtx(RECOMMENDATION_TMP_DATA_DIR + dataset + "_ASSOCIATION_MATRIX.mtx")
            resource_ids = load(RECOMMENDATION_TMP_DATA_DIR + dataset + "_RESOURCE_IDS.npy")
            tag_names = load(RECOMMENDATION_TMP_DATA_DIR + dataset + "_TAG_NAMES.npy")
        except Exception:
            raise Exception("Error loading association matrix and tag names, ids data")

        if metric not in ['cosine', 'binary', 'coocurrence', 'jaccard']:
            raise Exception("Wrong similarity metric specified")

        if self.verbose:
            print "Computing similarity matrix from a resource subset of the whole association matrix..."

        # Get index of resources to train (usable index for M)
        resource_id_positions = where(in1d(resource_ids, training_set, assume_unique=True))[0]

        # Matrix multiplication (only taking into account resources in the training set and ALL tags)
        MM = spmatrix.dot(M[resource_id_positions, :], M[resource_id_positions, :])

        # Get similarity matrix
        sim_matrix = spmatrix.ll_mat(MM.shape[0], MM.shape[0])
        non_zero_index = MM.keys()
        for index in non_zero_index:
            if metric == 'cosine':
                sim_matrix[index[0], index[1]] = MM[index[0], index[1]] * (
                    1 / (sqrt(MM[index[0], index[0]]) * sqrt(MM[index[1], index[1]])))
            elif metric == 'coocurrence':
                sim_matrix[index[0], index[1]] = MM[index[0], index[1]]
            elif metric == 'binary':
                sim_matrix[index[0], index[1]] = MM[index[0], index[1]] / MM[index[0], index[1]]
            elif metric == 'jaccard':
                sim_matrix[index[0], index[1]] = MM[index[0], index[1]] * (
                    1 / (MM[index[0], index[0]] + MM[index[1], index[1]] - MM[index[0], index[1]]))

        # Clean out similarity matrix (remove tags that are not used)
        tag_positions = []
        for i in range(0, sim_matrix.shape[0]):
            if sim_matrix[i, i] != 0.0:
                tag_positions.append(i)

        # Transform sparse similarity matrix to npy format
        sim_matrix_npy = mtx2npy(sim_matrix[tag_positions, tag_positions])
        tag_names_sim_matrix = tag_names[tag_positions]

        if save_sim:
            if not is_general_recommender:
                # Save sim
                path = RECOMMENDATION_TMP_DATA_DIR + dataset + "_%s_SIMILARITY_MATRIX_" % out_name_prefix + metric + "_SUBSET.npy"
                if self.verbose:
                    print "Saving to " + path + "..."
                save(path, sim_matrix_npy)
                # Save tag names
                path = RECOMMENDATION_TMP_DATA_DIR + dataset + "_%s_SIMILARITY_MATRIX_" % out_name_prefix + metric + "_SUBSET_TAG_NAMES.npy"
                if self.verbose:
                    print "Saving to " + path + "..."
                save(path, tag_names_sim_matrix)
            else:
                # Save sim
                path = RECOMMENDATION_TMP_DATA_DIR + dataset + "_SIMILARITY_MATRIX_" + metric + ".npy"
                if self.verbose:
                    print "Saving to " + path + "..."
                save(path, sim_matrix_npy)
                # Save tag names
                path = RECOMMENDATION_TMP_DATA_DIR + dataset + "_SIMILARITY_MATRIX_" + metric + "_TAG_NAMES.npy"
                if self.verbose:
                    print "Saving to " + path + "..."
                save(path, tag_names_sim_matrix)

        return {'SIMILARITY_MATRIX': sim_matrix_npy, 'TAG_NAMES': tag_names_sim_matrix}

    def process_tag_recommendation_data(self, resources_limit=None, tag_threshold=10, line_limit=99999999999999,
                                        recompute_all_classes=False, similarity_metric="cosine"):
        # Process tas file and turn into association matrix and derived files
        database_name = self.tas_to_association_matrix(tag_threshold=tag_threshold, line_limit=line_limit)

        print "Loading community detector..."
        cd = CommunityDetector(verbose=False, PATH=RECOMMENDATION_DATA_DIR + "Classifier")
        print cd

        # Classify existing resources
        resources_tags = loadFromJson(RECOMMENDATION_TMP_DATA_DIR + database_name + '_RESOURCES_TAGS.json')
        instances_ids = resources_tags.keys()
        try:
            resource_class = loadFromJson(RECOMMENDATION_DATA_DIR + 'Classifier_classified_resources.json')
        except Exception:
            resource_class = dict()
        for count, id in enumerate(instances_ids):
            if not recompute_all_classes:
                # Only classify resources that were not classified before
                if id not in resource_class:
                    resource_class[id] = cd.detectCommunity(input_tags=resources_tags[id])
            else:
                resource_class[id] = cd.detectCommunity(input_tags=resources_tags[id])
            if self.verbose:
                sys.stdout.write("\rClassifying resources... %.2f%%" % (float(100 * (count + 1)) / len(instances_ids)))
                sys.stdout.flush()
        print ""
        saveToJson(RECOMMENDATION_DATA_DIR + 'Classifier_classified_resources.json', resource_class)

        print ""
        print "\nComputing data for general recommender..."
        self.association_matrix_to_similarity_matrix(
            dataset=database_name,
            training_set=instances_ids[0:resources_limit],
            save_sim=True,
            is_general_recommender=True,
            metric=similarity_metric,
        )

        print "\nComputing data for class recommenders..."
        instance_id_class = []
        distinct_classes = []
        for count, instance_id in enumerate(instances_ids):
            class_id = resource_class[instance_id]
            instance_id_class.append([instance_id, class_id])
            if class_id not in distinct_classes:
                distinct_classes.append(class_id)
        print distinct_classes

        for collection_id in distinct_classes:
            print "\nComputing recommender for collection %s..." % collection_id
            # All resources from the training set classified as the selected category
            # (instead of all manually labeled)
            training_ids = []
            for instance in instance_id_class:
                if instance[1] == collection_id:
                    training_ids.append(instance[0])
            # Apply limit
            training_ids = training_ids[0:resources_limit]
            if len(training_ids) < 1:
                raise Exception("Too few training ids for collection %s" % collection_id)
            self.association_matrix_to_similarity_matrix(
                dataset=database_name,
                training_set=training_ids,
                save_sim=True,
                out_name_prefix=collection_id,
                is_general_recommender=False,
                metric=similarity_metric,
            )
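# Hypothetical end-to-end usage, assuming RECOMMENDATION_DATA_DIR contains a
# valid Index.json as described in the class docstring. After the matrices are
# written to the tmp dir, clear_temp_files() (defined earlier, presumably on
# the same server-side processor) promotes them and reloads the server.
processor = RecommendationDataProcessor(verbose=True)
processor.process_tag_recommendation_data(tag_threshold=10,
                                          similarity_metric="cosine")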