def getVectorsKeywords(movie_strings, keywords):
    """Write one binary movie-membership vector per keyword.

    Every file name returned for the keyword-mapping directory is converted
    with ``int()`` and used as an index into ``movie_strings``; each stripped
    line of that file is treated as a keyword belonging to that movie.
    (Presumably the files are named by movie index and hold one keyword per
    line -- confirm against the data set.)

    For each entry of ``keywords`` a list of ``len(movie_strings)`` zeros is
    built, the positions of the movies whose file contained that keyword are
    set to 1, and the list is passed to ``dt.write1dArray`` with the path
    ``filmdata/classesKeywords/NewData/class-<keyword>``.

    Args:
        movie_strings: Sequence of movie name strings, indexed by the integer
            value of each mapping file name.
        keywords: Mutable sequence of keyword strings. Each entry is replaced
            in place by its stripped form.

    Raises:
        ValueError: If a mapping file name is not an integer literal.
        IndexError: If a mapping file name is out of range for
            ``movie_strings``.
    """
    # Forward slashes, matching the path used to open the files; the former
    # backslash literal relied on invalid escape sequences and only worked
    # on Windows.
    mapping_dir = "filmdata/KeywordData/Movie_Most_Common_Keyword_Mapping"
    file_names = dt.getAllFileNames(mapping_dir)

    # Names with the last five characters removed; only used when reporting
    # a mapping file that could not be read.
    movie_names = []
    for movie in movie_strings:
        movie_names.append(movie.strip()[:-5])
        print(movie)

    print("Mapping to memory.")
    dict_mapping = {}       # movie string -> name of its mapping file
    movies_by_keyword = {}  # keyword -> set of movie strings listing it
    for file_name in file_names:
        movie_index = int(file_name)
        try:
            with open(mapping_dir + "/" + file_name, "r") as handle:
                lines = handle.readlines()
        except IOError:
            # Best effort: report the unreadable movie and carry on.
            print(movie_names[movie_index])
            continue
        movie = movie_strings[movie_index]
        dict_mapping[movie] = file_name
        for line in lines:
            movies_by_keyword.setdefault(line.strip(), set()).add(movie)

    for position, keyword in enumerate(keywords):
        keywords[position] = keyword.strip()

    print("Iterating over memory.")
    for keyword in keywords:
        vector = [0] * len(movie_strings)
        print(len(vector))
        # Resolve each movie through dict_mapping so that duplicate movie
        # strings collapse onto the index of the last file seen for them.
        for movie in movies_by_keyword.get(keyword, ()):
            vector[int(dict_mapping[movie])] = 1
        print(keyword)
        dt.write1dArray(
            vector, "filmdata/classesKeywords/NewData/class-" + keyword)
def writeMissing(folder_name, item_count=15000):
    """Record which ids in ``range(item_count)`` have no file in a folder.

    The names returned by ``dt.getAllFileNames(folder_name)`` are converted
    with ``int()``; every id in ``range(item_count)`` that is not among them
    is collected, and the resulting list is passed to ``dt.write1dArray``
    with the path ``filmdata/MISSING_KEYWORD_ITEMS.txt``. A "found" or
    "no found" line is printed for each id.

    Args:
        folder_name: Folder handed to ``dt.getAllFileNames``.
        item_count: Number of consecutive ids, starting at 0, to check.
            Defaults to 15000, the value previously hard-coded.

    Raises:
        ValueError: If a returned file name is not an integer literal.
    """
    print("?")
    # Convert the names once; membership tests are then O(1) per id instead
    # of a rescan of every file name.
    present = set(int(name) for name in dt.getAllFileNames(folder_name))

    missing = []
    for item in range(item_count):
        if item in present:
            print("found %d" % item)
        else:
            missing.append(item)
            print("no found %d" % item)

    dt.write1dArray(missing, "filmdata/MISSING_KEYWORD_ITEMS.txt")