def LDA(tf, names, components, file_name, doc_topic_prior, topic_word_prior, data_type, rewrite_files):
    # Model-name parameter was removed as it was unused; existing files were renamed manually.
    rep_name = "../data/" + data_type + "/LDA/rep/" + file_name + ".txt"
    model_name = "../data/" + data_type + "/LDA/model/" + file_name + ".txt"
    names_name = "../data/" + data_type + "/LDA/names/" + file_name + ".txt"
    all_names = [rep_name, names_name]
    if dt.allFnsAlreadyExist(all_names) and not rewrite_files:
        print("Already completed")
        return
    print(len(tf), len(tf[0]))
    print("Fitting LDA model with tf features")
    lda = LatentDirichletAllocation(doc_topic_prior=doc_topic_prior, topic_word_prior=topic_word_prior,
                                    n_topics=components)
    t0 = time()
    tf = np.asarray(tf).transpose()
    new_rep = lda.fit_transform(tf)
    print("done in %0.3fs." % (time() - t0))
    print("\nTopics in LDA model:")
    topics = print_top_words(lda, names)
    topics.reverse()
    dt.write1dArray(topics, names_name)
    dt.write2dArray(new_rep.transpose(), rep_name)
    joblib.dump(lda, model_name)
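# print_top_words is called above but not defined in this section. A minimal sketch of
# what it is assumed to do, following the common scikit-learn LDA recipe: return one
# line per topic listing that topic's highest-weighted words. The n_top_words parameter
# is an assumption, not part of the original code.
def print_top_words(model, feature_names, n_top_words=10):
    topics = []
    for topic_idx, topic in enumerate(model.components_):
        # argsort ascending, then take the last n_top_words indices in reverse order
        top = " ".join(feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1])
        print("Topic #%d: %s" % (topic_idx, top))
        topics.append(top)
    return topics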
def makeTopVectors(filename):
    vectors = dt.import2dArray("Rankings/" + filename + ".space")
    top250names = dt.import1dArray("filmdata/top250.txt")
    film_names = dt.import1dArray("filmdata/filmNames.txt")
    indexes = []
    ordered_names = []
    for f in range(len(film_names)):
        for t in top250names:
            if film_names[f] == t:
                indexes.append(f)
                ordered_names.append(t)
    top_vectors = [[]]
    for v in range(len(vectors)):
        if v > 0:
            top_vectors.append([])
        for i in range(len(vectors[v])):
            for id in indexes:
                if i == id:
                    top_vectors[v].append(vectors[v][i])
    dt.write2dArray(top_vectors, "Plots/Top174" + filename + ".space")
    dt.write1dArray(ordered_names, "Plots/Top174OrderedByOriginalList.txt")
def main(data_type, vector_size, window_size, min_count, sampling_threshold, negative_size, train_epoch, dm,
         worker_count, train_wv, concatenate_wv, use_hierarchical_softmax):
    file_name = "Doc2Vec" + " VS" + str(vector_size) + " WS" + str(window_size) + " MC" + str(min_count) + \
                " ST" + str(sampling_threshold) + " NS" + str(negative_size) + " TE" + str(train_epoch) + \
                " DM" + str(dm) + " WC" + str(worker_count) + "spacy" + \
                " TW" + str(train_wv) + " CW" + str(concatenate_wv) + " HS" + str(use_hierarchical_softmax)
    corpus_fn = "../data/raw/" + data_type + "/corpus_processed.txt"
    if os.path.exists(corpus_fn) is False:
        x_train = np.load("../data/raw/" + data_type + "/x_train_w.npy")
        x_test = np.load("../data/raw/" + data_type + "/x_test_w.npy")
        corpus = np.concatenate((x_train, x_test), axis=0)
        text_corpus = np.empty(len(corpus), dtype=np.object)
        for i in range(len(corpus)):
            text_corpus[i] = " ".join(corpus[i])
            print(text_corpus[i])
        dt.write1dArray(text_corpus, corpus_fn)
    embedding_fn = "/home/tom/Downloads/glove.6B/glove.6B.300d.txt"
    model_fn = "../data/" + data_type + "/doc2vec/" + file_name + ".bin"
    vector_fn = "../data/" + data_type + "/nnet/spaces/" + file_name + ".npy"
    score_fn = "../data/" + data_type + "/doc2vec/" + file_name + "catacc.score"
    if os.path.exists(model_fn):
        print("Imported model")
        model = g.utils.SaveLoad.load(model_fn)
    elif file_name[:7] == "Doc2Vec":
        model = doc2Vec(embedding_fn, corpus_fn, vector_size, window_size, min_count, sampling_threshold,
                        negative_size, train_epoch, dm, worker_count, train_wv, concatenate_wv,
                        use_hierarchical_softmax)
        model.save(model_fn)
    if os.path.exists(vector_fn) is False:
        vectors = []
        for d in range(len(model.docvecs)):
            vectors.append(model.docvecs[d])
        np.save(vector_fn, vectors)
    else:
        print("Imported vectors")
        vectors = np.load(vector_fn)
    if os.path.exists(score_fn) is False or file_name[:7] != "Doc2Vec":
        print("Getting score")
        if data_type == "sentiment":
            classes = dt.import1dArray("../data/" + data_type + "/classify/" + data_type + "/class-all", "i")
            x_train, y_train, x_test, y_test = sentiment.getSplits(vectors, classes)
            scores = linearSVMScore(x_train, y_train, x_test, y_test)
        else:
            classes = dt.import2dArray("../data/" + data_type + "/classify/" + data_type + "/class-all", "i")
            x_train, y_train, x_test, y_test = newsgroups.getSplits(vectors, classes)
            scores = multiClassLinearSVM(x_train, y_train, x_test, y_test)
        print(scores)
        dt.write1dArray(scores, score_fn)
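# doc2Vec is called above but not defined in this section. A plausible sketch using the
# older gensim API that the surrounding code implies (g.utils.SaveLoad, model.docvecs,
# presumably "import gensim as g"). The keyword names (size, sample, iter) follow
# pre-4.0 gensim and the parameter mapping is an assumption; embedding_fn and train_wv
# are left unused here rather than guessing how the GloVe file was injected.
def doc2Vec(embedding_fn, corpus_fn, vector_size, window_size, min_count, sampling_threshold,
            negative_size, train_epoch, dm, worker_count, train_wv, concatenate_wv,
            use_hierarchical_softmax):
    docs = g.models.doc2vec.TaggedLineDocument(corpus_fn)  # one document per line of the corpus file
    model = g.models.Doc2Vec(docs, size=vector_size, window=window_size, min_count=min_count,
                             sample=sampling_threshold, negative=negative_size, workers=worker_count,
                             hs=int(use_hierarchical_softmax), dm=dm, dm_concat=int(concatenate_wv),
                             iter=train_epoch)
    return model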
def convertEntityNamesToIDS(ID_fn, all_names_fn, individual_names_fn, output_fn):
    ids = dt.import1dArray(ID_fn)
    all_names = dt.import1dArray(all_names_fn)
    individual_names = dt.import1dArray(individual_names_fn)
    indexes = []
    for n in range(len(all_names)):
        for name in individual_names:
            if all_names[n] == name:
                indexes.append(n)
    dt.write1dArray(np.asarray(ids)[indexes], output_fn)
def main(data_type):
    if data_type == "newsgroups":
        corpus = fetch_20newsgroups(subset='all', shuffle=False,
                                    remove=("headers", "footers", "quotes")).data
        tokenized_corpus, text_corpus = tokenizeLowercaseSpacy(corpus)
        np.save("../data/raw/newsgroups/corpus.npy", tokenized_corpus)
        dt.write1dArray(text_corpus, "../data/raw/newsgroups/corpus_processed.txt")
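# tokenizeLowercaseSpacy (and the spacyTokenizeLowercase variant used later) is not
# defined in this section. A minimal sketch of the assumed behaviour: lowercase each
# document, tokenize it with spaCy, and return both the token lists and the re-joined
# text. The model name "en" is an assumption tied to older spaCy releases.
import spacy

def tokenizeLowercaseSpacy(corpus):
    nlp = spacy.load("en")  # hypothetical model name; newer spaCy uses "en_core_web_sm"
    tokenized_corpus = []
    text_corpus = []
    for doc in corpus:
        tokens = [t.text for t in nlp(doc.lower())]
        tokenized_corpus.append(tokens)
        text_corpus.append(" ".join(tokens))
    return tokenized_corpus, text_corpus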
def getAvailableEntities(entity_names_fns, data_type, classification):
    entity_names = []
    for e in entity_names_fns:
        entity_names.append(dt.import1dArray(e))
    name_dict = {}  # renamed from "dict" to avoid shadowing the builtin
    for entity_name in entity_names:
        for name in entity_name:
            name_dict[name] = 0
    available_entities = []
    for key in name_dict:
        available_entities.append(key)
    dt.write1dArray(available_entities,
                    "../data/" + data_type + "/classify/" + classification + "/available_entities.txt")
def writeFromMultiClass(multi_class_fn, output_folder, entity_names_fn, data_type, classify_name):
    # Get the entities we have phrases for
    entity_names = dt.import1dArray(entity_names_fn)
    # Import the multi-class labels, one "name<tab>class" pair per line
    multi_class = dt.import1dArray(multi_class_fn)
    class_names = []
    class_val = []
    highest_class = 0
    for line in multi_class:
        cn, cv = re.split(r'\t+', line)
        cv = int(cv)
        class_names.append(cn)
        class_val.append(cv)
        if cv > highest_class:
            highest_class = cv
    matched_entity_names = list(set(entity_names).intersection(class_names))
    matched_entity_names.sort()
    dt.write1dArray(matched_entity_names,
                    "../data/" + data_type + "/classify/" + classify_name + "/available_entities.txt")
    # Delete the class values of entities that were not matched
    indexes_to_delete = []
    for n in range(len(class_names)):
        found = False
        for en in range(len(matched_entity_names)):
            if class_names[n] == matched_entity_names[en]:
                found = True
                break
        if found is False:
            indexes_to_delete.append(n)
    class_val = np.delete(class_val, indexes_to_delete)
    # One-hot encode the remaining class values
    classes = []
    print("Found " + str(highest_class) + " classes")
    for e in range(len(matched_entity_names)):
        class_a = [0] * highest_class
        class_a[class_val[e] - 1] = 1
        classes.append(class_a)
    dt.write2dArray(classes, "../data/" + data_type + "/classify/" + classify_name + "/class-all")
    print("Wrote class all")
    classes = np.asarray(classes).transpose()
    for cn in range(len(classes)):
        dt.write1dArray(classes[cn], "../data/" + data_type + "/classify/" + classify_name + "/class-" + str(cn))
        print("Wrote", "class-" + str(cn))
def trimRankings(rankings_fn, available_indexes_fn, names, folder_name):
    available_indexes = dt.import1dArray(available_indexes_fn)
    rankings = np.asarray(dt.import2dArray(rankings_fn))
    names = dt.import1dArray(names)
    trimmed_rankings = []
    for r in range(len(rankings)):
        trimmed = rankings[r].take(available_indexes)
        trimmed_rankings.append(trimmed)
    for a in range(len(trimmed_rankings)):
        print("Writing", names[a])
        dt.write1dArray(trimmed_rankings[a], folder_name + "class-" + names[a])
    print("Writing", rankings_fn[-6:])
    dt.write2dArray(trimmed_rankings, folder_name + "class-" + rankings_fn[-6:])
def getNDCG(rankings_fn, fn, data_type, bow_fn, ppmi_fn, lowest_count, rewrite_files=False,
            highest_count=0, classification=""):
    # Check if the NDCG scores have already been calculated; if they have, skip.
    ndcg_fn = "../data/" + data_type + "/ndcg/" + fn + ".txt"
    all_fns = [ndcg_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", getNDCG.__name__)
        return
    else:
        print("Running task", getNDCG.__name__)
    # Get the PPMI values for every word and the list of words ("names")
    names = dt.import1dArray("../data/" + data_type + "/bow/names/" + bow_fn)
    ppmi = dt.import2dArray("../data/" + data_type + "/bow/ppmi/" + ppmi_fn)
    # Process the rankings and the PPMI scores row-by-row so as to not run out of memory
    ndcg_a = []
    # spearman_a = []
    with open(rankings_fn) as rankings:
        r = 0
        for lr, lp in zip(rankings, ppmi):  # pair each ranking line with its PPMI row
            # Get the ranking indices sorted from highest to lowest score
            sorted_indices = np.argsort(list(map(float, lr.strip().split())))[::-1]
            # Get the NDCG score of the ranking, using the PPMI values as relevance judgements
            ndcg = ndcg_from_ranking(lp, sorted_indices)
            ndcg_a.append(ndcg)
            print("ndcg", ndcg, names[r], r)
            # smr = spearmanr(ppmi_indices, sorted_indices)[1]  # Spearman variant, left disabled
            # spearman_a.append(smr)
            # print("spearman", smr, names[r], r)
            r += 1
    # Save NDCG
    dt.write1dArray(ndcg_a, ndcg_fn)
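# ndcg_from_ranking is used above but defined elsewhere. A standard implementation for
# reference: DCG of the given ranking against the relevance values, normalised by the
# DCG of the ideal (relevance-sorted) ranking. The exponential gain and log2 discount
# are the usual convention; the original may differ in details.
def dcg_from_ranking(y_true, ranking):
    y_true = np.asarray(y_true)
    ranking = np.asarray(ranking, dtype=np.int64)
    gains = 2 ** y_true[ranking] - 1
    discounts = np.log2(np.arange(len(ranking)) + 2)  # ranks 1..k discounted by log2(rank+1)
    return np.sum(gains / discounts)

def ndcg_from_ranking(y_true, ranking):
    ideal_ranking = np.argsort(y_true)[::-1]
    best = dcg_from_ranking(y_true, ideal_ranking)
    return dcg_from_ranking(y_true, ranking) / best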
def get_code(self, tree, feature_names, class_names, filename, data_type):
    rules_array = []
    # Probably not needed: writes an empty rules file, then beautifies it in place
    dt.write1dArray(rules_array, "../data/" + data_type + "/rules/text_rules/" + filename + ".txt")
    cleaned = jsbeautifier.beautify_file("../data/" + data_type + "/rules/text_rules/" + filename + ".txt")
    file = open("../data/" + data_type + "/rules/text_rules/" + filename + ".txt", "w")
    file.write(cleaned)
    file.close()
def getSimilarClusters(cluster_dict_1, cluster_dict_2, trim_amt, file_name, data_type,
                       threshold_for_stopping, threshold_for_stopping_1):
    matching_clusters = np.zeros(len(cluster_dict_1), dtype=np.int32)
    new_cluster_dict_2 = []
    for c in cluster_dict_2:
        new_cluster_dict_2.append(np.flipud(c))
    cluster_dict_2 = new_cluster_dict_2
    positions = np.zeros(len(cluster_dict_1))
    for c in range(len(cluster_dict_1)):
        print(c)
        lowest_pos = 242343  # sentinel larger than any real position
        lowest_cluster = len(cluster_dict_2) - 1
        for n in range(len(cluster_dict_1[c])):
            if n > threshold_for_stopping_1:
                break
            name_to_match = cluster_dict_1[c][n]
            if ":" in name_to_match:
                name_to_match = name_to_match[:-1]
            for c2 in range(len(cluster_dict_2)):
                for n2 in range(len(cluster_dict_2[c2])):
                    if n2 > threshold_for_stopping:
                        break
                    name_to_match2 = cluster_dict_2[c2][n2]
                    if ":" in name_to_match2:
                        name_to_match2 = name_to_match2[:-1]
                    if name_to_match == name_to_match2:
                        if n2 < lowest_pos:
                            lowest_cluster = c2
                            lowest_pos = n2
                        break
        matching_clusters[c] = lowest_cluster
        positions[c] = lowest_pos
    sorted_matching_indexes = matching_clusters[np.argsort(positions)]
    sorted_orig_indexes = np.asarray(list(range(len(cluster_dict_1))))[np.argsort(positions)]
    print("--------------------------------------------------")
    print("SORTED")
    print("--------------------------------------------------")
    lines = []
    for c in range(len(sorted_orig_indexes)):
        line_p1 = ""
        for n in cluster_dict_1[sorted_orig_indexes[c]][:trim_amt]:
            line_p1 = line_p1 + n + " "
        line_p2 = ""
        for k in cluster_dict_2[sorted_matching_indexes[c]][:trim_amt]:
            line_p2 = line_p2 + k + " "
        line = line_p1 + " |||| " + line_p2
        lines.append(line)
        print(line)
    dt.write1dArray(lines, "../data/" + data_type + "/investigate/" + file_name + str(trim_amt) + ".txt")
def printIndividualFromAll(data_type, type, lowest_count, max, classification, all_fn=None, names_array=None):
    fn = "../data/" + data_type + "/bow/"
    if all_fn is None:
        all_fn = fn + type + "/class-all-" + str(lowest_count) + "-" + str(max) + "-" + str(classification)
    if names_array is None:
        names = dt.import1dArray(fn + "names/" + str(lowest_count) + "-" + str(max) + "-" +
                                 str(classification) + ".txt")
    else:
        names = names_array
    with open(all_fn) as all:
        c = 0
        for la in all:
            convert = dt.convertLine(la)
            dt.write1dArray(convert, fn + type + "/class-" + str(names[c] + "-" + str(lowest_count) + "-" +
                                                                 str(max) + "-" + str(classification)))
            print(c, len(names), names[c])
            c += 1
    print("wrote individual from all")
def getTop250Movies(entity_names):
    imdb = dt.import1dArray("../data/raw/imdb/ratings/ratings.list")[28:278]
    orig_en = entity_names
    # Strip the year from each entity name and normalise it for matching
    for e in range(len(entity_names)):
        entity_names[e] = "".join(entity_names[e].split()[:-1])
        entity_names[e] = dt.removeEverythingFromString(entity_names[e])
    top_en = []
    for string in imdb:
        string = string.split(".")[1][1:]
        string = string.split()[:-1]
        string = " ".join(string)
        string = dt.removeEverythingFromString(string)
        top_en.append(string)
    matched_index = []
    for e in range(len(entity_names)):
        for x in range(len(top_en)):
            if entity_names[e] == top_en[x]:
                matched_index.append(e)
                print(entity_names[e])
                break
    dt.write1dArray(matched_index, "../data/movies/top_imdb_indexes.txt")
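# dt.removeEverythingFromString is used throughout for name matching but not defined
# here. A minimal sketch of the assumed behaviour, consistent with the punctuation
# translator built in writeClassesFromNames below: strip punctuation and whitespace
# and lowercase, so differently-formatted names compare equal.
import string as string_lib

def removeEverythingFromString(s):
    s = s.translate(str.maketrans({key: None for key in string_lib.punctuation}))
    s = "".join(s.split())  # drop all whitespace
    return s.lower()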
def writeClassesFromNames(folder_name, file_names, output_folder):
    names = dt.getFolder(folder_name)
    all_names = defaultdict(int)
    entity_names = dt.import1dArray(file_names)
    translator = str.maketrans({key: None for key in string.punctuation})
    for type in range(len(names)):
        for n in range(len(names[type])):
            names[type][n] = dt.removeEverythingFromString(names[type][n])
            all_names[names[type][n]] += 1
    available_class_names = []
    available_indexes = []
    for n in range(len(entity_names)):
        name = entity_names[n]
        original_name = name
        name = dt.removeEverythingFromString(name)
        if all_names[name] > 0:
            available_class_names.append(original_name)
            available_indexes.append(n)
            print(name, "exists")
        else:
            print(name, "FAIL")
    dt.write1dArray(available_indexes, output_folder + "available_indexes.txt")
    dt.write1dArray(available_class_names, output_folder + "available_entities.txt")
    print("Wrote available indexes and entities")
    class_all = []
    for c in range(len(names)):
        binary_class = []
        for n in range(len(available_class_names)):
            available_class_names[n] = dt.removeEverythingFromString(available_class_names[n])
            if available_class_names[n] in names[c]:
                binary_class.append(1)
            else:
                binary_class.append(0)
        dt.write1dArray(binary_class, output_folder + "class-" + str(c))
        class_all.append(binary_class)
    dt.write2dArray(np.asarray(class_all).transpose(), output_folder + "class-all")
    print("Wrote class-all")
    for n in range(len(entities_unique)):
        if clean_ent_unique[n] == clean_us_ent[i]:
            new_class_all[a + 4][n] = 1
            break
    names = ["UK-PG", "UK-12-12A", "UK-15", "UK-18", "USA-G", "USA-PG-PG13", "USA-R"]
    for i in range(len(new_class_all)):
        dt.write1dArray(new_class_all[i], "../data/movies/classify/ratings/class-" + names[i])
    new_class_all = np.asarray(new_class_all).transpose()
    dt.write2dArray(new_class_all, "../data/movies/classify/ratings/class-all")
    dt.write1dArray(entities_unique, "../data/movies/classify/ratings/available_entities.txt")
"""

get_all = False
additional_name = ""
# make_individual = True
make_individual = False
sparse_matrix = False
print("??")
class_type = "movies"
classification = "all"
def removeClass(array_fn):
    array = dt.import1dArray(array_fn)
    for e in range(len(array)):
        array[e] = array[e][6:]  # strip the leading "class-" prefix
    dt.write1dArray(array, array_fn)
def __init__(self, vector_path, class_path, property_names_fn, file_name, svm_type, training_size=10000,
             lowest_count=200, highest_count=21470000, get_kappa=True, get_f1=True, single_class=True,
             data_type="movies", getting_directions=True, threads=1, chunk_amt=0, chunk_id=0,
             rewrite_files=False, classification="all", loc="../data/"):
    self.get_kappa = True
    self.get_f1 = get_f1
    self.data_type = data_type
    self.classification = classification
    self.lowest_amt = lowest_count
    self.higher_amt = highest_count
    if chunk_amt > 0:
        file_name = file_name + " CID" + str(chunk_id) + " CAMT" + str(chunk_amt)
    directions_fn = loc + data_type + "/svm/directions/" + file_name + ".txt"
    ktau_scores_fn = loc + data_type + "/svm/f1/" + file_name + ".txt"
    kappa_fn = loc + data_type + "/svm/kappa/" + file_name + ".txt"
    acc_fn = loc + data_type + "/svm/acc/" + file_name + ".txt"
    all_fns = [directions_fn, kappa_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", "getSVMResults")
        return
    else:
        print("Running task", "getSVMResults")
    y_train = 0
    y_test = 0
    vectors = np.asarray(dt.import2dArray(vector_path))
    print("imported vectors")
    if not getting_directions:
        classes = np.asarray(dt.import2dArray(class_path))
        print("imported classes")
    property_names = dt.import1dArray(property_names_fn)
    print("imported property names")
    if chunk_amt > 0:
        # Take this process's chunk of the property names; the last chunk gets the remainder
        if chunk_id == chunk_amt - 1:
            chunk = int(len(property_names) / chunk_amt)
            multiply = chunk_amt - 1
            property_names = property_names[chunk * multiply:]
        else:
            property_names = dt.chunks(property_names, int((len(property_names) / chunk_amt)))[chunk_id]
    if not getting_directions:
        x_train, x_test, y_train, y_test = train_test_split(vectors, classes, test_size=0.3, random_state=0)
    else:
        x_train = vectors
        x_test = vectors
    if get_f1:
        y_train = y_train.transpose()
        y_test = y_test.transpose()
        print("transposed")
    self.x_train = x_train
    self.x_test = x_test
    self.y_train = y_train
    self.y_test = y_test
    if self.get_f1 is False:
        print("running svms")
        kappa_scores, directions, ktau_scores, property_names = self.runAllSVMs(
            y_test, y_train, property_names, file_name, svm_type, getting_directions, threads)
        dt.write1dArray(kappa_scores, kappa_fn)
        dt.write2dArray(directions, directions_fn)
        dt.write1dArray(ktau_scores, ktau_scores_fn)
        dt.write1dArray(property_names, property_names_fn + file_name + ".txt")
    else:
        final_f1 = []
        final_acc = []
        for y in range(len(y_train)):
            f1, acc = self.runClassifySVM(y_test[y], y_train[y])
            print(f1, acc)
            final_f1.append(f1)
            final_acc.append(acc)
        dt.write1dArray(final_f1, ktau_scores_fn)
        dt.write1dArray(final_acc, acc_fn)
def __init__(self, class_path=None, get_scores=False, randomize_finetune_weights=False, dropout_noise=None,
             amount_of_hidden=0, epochs=1, learn_rate=0.01, loss="mse", batch_size=1, past_model_bias_fn=None,
             identity_swap=False, reg=0.0, amount_of_finetune=[], output_size=25, hidden_activation="tanh",
             layer_init="glorot_uniform", output_activation="tanh", deep_size=None,
             corrupt_finetune_weights=False, split_to_use=-1, hidden_layer_size=100,
             file_name="unspecified_filename", vector_path=None, is_identity=False, finetune_size=0,
             data_type="movies", optimizer_name="rmsprop", noise=0.0, fine_tune_weights_fn=None,
             past_model_weights_fn=None, from_ae=True, save_outputs=False, label_names_fn="",
             rewrite_files=False, cv_splits=1, cutoff_start=0.2, development=False, class_weight=None,
             csv_fn=None, tune_vals=False, get_nnet_vectors_path=None, classification_name="all",
             limit_entities=False, limited_label_fn="", vector_names_fn="", identity_activation="linear",
             loc="../data/", lock_weights_and_redo=False):
    total_file_name = loc + data_type + "/nnet/spaces/" + file_name
    weights_fn = loc + data_type + "/nnet/weights/" + file_name + "L0.txt"
    bias_fn = loc + data_type + "/nnet/bias/" + file_name + "L0.txt"
    rank_fn = loc + data_type + "/nnet/clusters/" + file_name + ".txt"
    all_fns = [weights_fn, bias_fn, rank_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", "nnet")
        return
    else:
        print("Running task", "nnet")
    self.class_path = class_path
    self.learn_rate = learn_rate
    self.epochs = epochs
    self.loss = loss
    self.batch_size = batch_size
    self.hidden_activation = hidden_activation
    self.layer_init = layer_init
    self.output_activation = output_activation
    self.hidden_layer_size = hidden_layer_size
    self.file_name = file_name
    self.vector_path = vector_path
    self.dropout_noise = dropout_noise
    self.finetune_size = finetune_size
    self.get_scores = get_scores
    self.reg = reg
    self.amount_of_finetune = amount_of_finetune
    self.amount_of_hidden = amount_of_hidden
    self.output_size = output_size
    self.identity_swap = identity_swap
    self.deep_size = deep_size
    self.from_ae = from_ae
    self.is_identity = is_identity
    self.randomize_finetune_weights = randomize_finetune_weights
    self.corrupt_finetune_weights = corrupt_finetune_weights
    self.fine_tune_weights_fn = fine_tune_weights_fn
    self.identity_activation = identity_activation
    self.lock_weights_and_redo = lock_weights_and_redo
    print(data_type)
    if optimizer_name == "adagrad":
        self.optimizer = Adagrad()
    elif optimizer_name == "sgd":
        self.optimizer = SGD()
    elif optimizer_name == "rmsprop":
        self.optimizer = RMSprop()
    elif optimizer_name == "adam":
        self.optimizer = Adam()
    elif optimizer_name == "adadelta":
        self.optimizer = Adadelta()
    else:
        print("optimizer not found")
        exit()
    entity_vectors = np.asarray(dt.import2dArray(self.vector_path))
    print("Imported vectors", len(entity_vectors), len(entity_vectors[0]))
    if get_nnet_vectors_path is not None:
        nnet_vectors = np.asarray(dt.import2dArray(get_nnet_vectors_path))
        print("Imported nnet vectors", len(nnet_vectors), len(nnet_vectors[0]))
    entity_classes = np.asarray(dt.import2dArray(self.class_path))
    print("Imported classes", len(entity_classes), len(entity_classes[0]))
    if fine_tune_weights_fn is None:
        vector_names = dt.import1dArray(vector_names_fn)
        limited_labels = dt.import1dArray(limited_label_fn)
        entity_vectors = np.asarray(dt.match_entities(entity_vectors, limited_labels, vector_names))
    if fine_tune_weights_fn is not None:
        if len(entity_vectors) != len(entity_classes):
            entity_classes = entity_classes.transpose()
            print("Transposed classes, now in form", len(entity_classes), len(entity_classes[0]))
        """
        # If bag-of-words input:
        if len(entity_vectors[0]) != len(entity_classes[0]):
            entity_vectors = entity_vectors.transpose()
            print("Transposed vectors, now in form", len(entity_vectors), len(entity_vectors[0]))
        """
    elif len(entity_vectors) != len(entity_classes):
        entity_vectors = entity_vectors.transpose()
        print("Transposed vectors, now in form", len(entity_vectors), len(entity_vectors[0]))
    self.input_size = len(entity_vectors[0])
    self.output_size = len(entity_classes[0])
    if fine_tune_weights_fn is not None:
        model_builder = self.fineTuneNetwork
        weights = []
        if from_ae:
            # Load the weights and biases of the autoencoder this network continues from
            self.past_weights = []
            past_model_weights = []
            for p in past_model_weights_fn:
                past_model_weights.append(np.asarray(dt.import2dArray(p), dtype="float64"))
            past_model_bias = []
            for p in past_model_bias_fn:
                past_model_bias.append(np.asarray(dt.import1dArray(p, "f"), dtype="float64"))
            for p in range(len(past_model_weights)):
                past_model_weights[p] = np.around(past_model_weights[p], decimals=6)
                past_model_bias[p] = np.around(past_model_bias[p], decimals=6)
            for p in range(len(past_model_weights)):
                self.past_weights.append([])
                self.past_weights[p].append(past_model_weights[p])
                self.past_weights[p].append(past_model_bias[p])
        for f in fine_tune_weights_fn:
            weights.extend(dt.import2dArray(f))
        r = np.asarray(weights, dtype="float64")
        for a in range(len(r)):
            r[a] = np.around(r[a], decimals=6)
        for a in range(len(entity_classes)):
            entity_classes[a] = np.around(entity_classes[a], decimals=6)
        self.fine_tune_weights = []
        self.fine_tune_weights.append(r.transpose())
        self.fine_tune_weights.append(np.zeros(shape=len(r), dtype="float64"))
    else:
        model_builder = self.classifierNetwork
    models = []
    x_train = []
    y_train = []
    x_test = []
    y_test = []
    x_dev = []
    y_dev = []
    train_x_c = []
    train_y_c = []
    c = 0
    for i in range(cv_splits):
        if split_to_use > -1:
            if c != split_to_use:
                c += 1
                continue
        models.append(model_builder())
        c += 1
    # Converting labels to categorical
    f1_scores = []
    accuracy_scores = []
    f1_averages = []
    accuracy_averages = []
    if cv_splits == 1:
        k_fold = KFold(n_splits=3, shuffle=False, random_state=None)
    else:
        k_fold = KFold(n_splits=cv_splits, shuffle=False, random_state=None)
    c = 0
    for train, test in k_fold.split(entity_vectors):
        if split_to_use > -1:
            if c != split_to_use:
                c += 1
                continue
        # The first 80% of each training fold is used for training, the rest for development
        x_train.append(entity_vectors[train[:int(len(train) * 0.8)]])
        y_train.append(entity_classes[train[:int(len(train) * 0.8)]])
        x_test.append(entity_vectors[test])
        y_test.append(entity_classes[test])
        x_dev.append(entity_vectors[train[int(len(train) * 0.8):len(train)]])
        y_dev.append(entity_classes[train[int(len(train) * 0.8):len(train)]])
        train_x_c, train_y_c = entity_vectors[train[:int(len(train) * 0.8)]], \
                               entity_classes[train[:int(len(train) * 0.8)]]
        if fine_tune_weights_fn is not None:
            train_x_c = entity_vectors
            train_y_c = entity_classes
        hist = models[0].fit(train_x_c, train_y_c, nb_epoch=self.epochs, batch_size=self.batch_size,
                             verbose=1, class_weight=class_weight)
        print(hist.history)
        c += 1
        if cv_splits == 1 or split_to_use == c:
            break
    if lock_weights_and_redo:
        print("REDO WITH LOCKED WEIGHTS")
        unlocked_model = Sequential()
        for l in range(0, len(models[0].layers) - 1):
            unlocked_model.add(models[0].layers[l])
        self.end_space = unlocked_model.predict(entity_vectors)
        dt.write2dArray(self.end_space, total_file_name + "L" + str(l) + "LSPACE" + ".txt")
        unlocked_model.add(Dense(output_dim=finetune_size, input_dim=self.hidden_layer_size,
                                 activation="linear", weights=self.fine_tune_weights))
        unlocked_model.compile(loss=self.loss, optimizer=self.optimizer)  # a new Sequential must be compiled before fit
        models[0] = unlocked_model
        hist = models[0].fit(train_x_c, train_y_c, nb_epoch=self.epochs, batch_size=self.batch_size,
                             verbose=1, class_weight=class_weight)
    original_fn = file_name
    for m in range(len(models)):
        if development:
            x_test[m] = x_dev[m]
            y_test[m] = y_dev[m]
        if get_scores:
            # Tune a per-class decision threshold on the training predictions
            vals_to_try = np.arange(start=cutoff_start, stop=1, step=0.01)
            test_pred = models[m].predict(x_train[m]).transpose()
            print(test_pred)
            y_train_m = np.asarray(y_train[m]).transpose()
            highest_f1 = [0] * len(test_pred)
            highest_vals = [0.2] * len(test_pred)
            if tune_vals:
                for c in range(len(test_pred)):
                    for val in vals_to_try:
                        test_pred_c = np.copy(test_pred[c])
                        test_pred_c[test_pred_c >= val] = 1
                        test_pred_c[test_pred_c < val] = 0
                        acc = accuracy_score(y_train_m[c], test_pred_c)
                        f1 = f1_score(y_train_m[c], test_pred_c, average="binary")
                        f1 = (f1 + acc) / 2
                        if f1 > highest_f1[c]:
                            highest_f1[c] = f1
                            highest_vals[c] = val
            print("optimal f1s", highest_f1)
            print("optimal vals", highest_vals)
            # Binarize the test predictions at the tuned thresholds and score them
            y_pred = models[m].predict(x_test[m]).transpose()
            y_test[m] = np.asarray(y_test[m]).transpose()
            for y in range(len(y_pred)):
                y_pred[y][y_pred[y] >= highest_vals[y]] = 1
                y_pred[y][y_pred[y] < highest_vals[y]] = 0
            f1_array = []
            accuracy_array = []
            for y in range(len(y_pred)):
                accuracy_array.append(accuracy_score(y_test[m][y], y_pred[y]))
                f1_array.append(f1_score(y_test[m][y], y_pred[y], average="binary"))
                print(f1_array[y])
            y_pred = y_pred.transpose()
            y_test[m] = np.asarray(y_test[m]).transpose()
            micro_average = f1_score(y_test[m], y_pred, average="micro")
            cv_f1_fn = loc + data_type + "/nnet/scores/F1 " + file_name + ".txt"
            cv_acc_fn = loc + data_type + "/nnet/scores/ACC " + file_name + ".txt"
            dt.write1dArray(f1_array, cv_f1_fn)
            dt.write1dArray(accuracy_array, cv_acc_fn)
            f1_scores.append(f1_array)
            accuracy_scores.append(accuracy_array)
            f1_average = np.average(f1_array)
            accuracy_average = np.average(accuracy_array)
            f1_averages.append(f1_average)
            accuracy_averages.append(accuracy_average)
            print("Average F1 Binary", f1_average, "Acc", accuracy_average)
            print("Micro Average F1", micro_average)
            f1_array.append(f1_average)
            f1_array.append(micro_average)
            accuracy_array.append(accuracy_average)
            accuracy_array.append(0.0)
            scores = [accuracy_array, f1_array]
            csv_fn = loc + data_type + "/nnet/csv/" + csv_fn + ".csv"
            file_names = [file_name + "ACC", file_name + "F1"]
            label_names = dt.import1dArray(label_names_fn)
            if dt.fileExists(csv_fn):
                print("File exists, writing to csv")
                try:
                    dt.write_to_csv(csv_fn, file_names, scores)
                except PermissionError:
                    print("CSV FILE WAS OPEN, WRITING TO ANOTHER FILE")
                    dt.write_to_csv(csv_fn[:len(csv_fn) - 4] + str(random.random()) + "FAIL.csv",
                                    [file_name], scores)
            else:
                print("File does not exist, recreating csv")
                key = []
                for l in label_names:
                    key.append(l)
                key.append("AVERAGE")
                key.append("MICRO AVERAGE")
                dt.write_csv(csv_fn, file_names, scores, key)
        if save_outputs:
            if limit_entities is False:
                self.output_clusters = models[m].predict(nnet_vectors)
            else:
                self.output_clusters = models[m].predict(entity_vectors)
            self.output_clusters = self.output_clusters.transpose()
            dt.write2dArray(self.output_clusters, rank_fn)
        # Write the hidden-layer spaces of each truncated sub-model
        for l in range(0, len(models[m].layers) - 1):
            if dropout_noise is not None and dropout_noise > 0.0:
                if l % 2 == 1:
                    continue
            print("Writing", l, "layer")
            truncated_model = Sequential()
            for a in range(l + 1):
                truncated_model.add(models[m].layers[a])
            truncated_model.compile(loss=self.loss, optimizer="sgd")
            if get_nnet_vectors_path is not None:
                self.end_space = truncated_model.predict(nnet_vectors)
            else:
                self.end_space = truncated_model.predict(entity_vectors)
            dt.write2dArray(self.end_space, total_file_name + "L" + str(l) + ".txt")
        for l in range(len(models[m].layers)):
            try:
                dt.write2dArray(models[m].layers[l].get_weights()[0],
                                loc + data_type + "/nnet/weights/" + file_name + "L" + str(l) + ".txt")
                dt.write1dArray(models[m].layers[l].get_weights()[1],
                                loc + data_type + "/nnet/bias/" + file_name + "L" + str(l) + ".txt")
            except IndexError:
                print("Layer", str(l), "Failed")
    if cv_splits > 1:
        # Average the per-class scores over the cross-validation splits
        class_f1_averages = []
        class_accuracy_averages = []
        f1_scores = np.asarray(f1_scores).transpose()
        accuracy_scores = np.asarray(accuracy_scores).transpose()
        for c in range(len(f1_scores)):
            class_f1_averages.append(np.average(f1_scores[c]))
            class_accuracy_averages.append(np.average(accuracy_scores[c]))
        f1_fn = loc + data_type + "/nnet/scores/F1 " + file_name + ".txt"
        acc_fn = loc + data_type + "/nnet/scores/ACC " + file_name + ".txt"
        dt.write1dArray(class_f1_averages, f1_fn)
        dt.write1dArray(class_accuracy_averages, acc_fn)
    overall_f1_average = np.average(f1_averages)
    overall_accuracy_average = np.average(accuracy_averages)
def __init__(self, features_fn, classes_fn, class_names_fn, cluster_names_fn, filename, max_depth=None,
             balance=None, criterion="entropy", save_details=False, data_type="movies", cv_splits=5,
             csv_fn="../data/temp/no_csv_provided.csv", rewrite_files=True, split_to_use=-1,
             development=False, limit_entities=False, limited_label_fn=None, vector_names_fn=None,
             pruning=1, save_results_so_far=False):
    vectors = np.asarray(dt.import2dArray(features_fn)).transpose()
    labels = np.asarray(dt.import2dArray(classes_fn, "i"))
    print("vectors", len(vectors), len(vectors[0]))
    print("labels", len(labels), len(labels[0]))
    cluster_names = dt.import1dArray(cluster_names_fn)
    label_names = dt.import1dArray(class_names_fn)
    all_fns = []
    file_names = ['ACC J48' + filename, 'F1 J48' + filename]
    acc_fn = '../data/' + data_type + '/rules/tree_scores/' + file_names[0] + '.scores'
    f1_fn = '../data/' + data_type + '/rules/tree_scores/' + file_names[1] + '.scores'
    all_fns.append(acc_fn)
    all_fns.append(f1_fn)
    all_fns.append(csv_fn)
    print(dt.allFnsAlreadyExist(all_fns), rewrite_files)
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files or save_results_so_far:
        print("Skipping task", "Weka Tree")
        return
    else:
        print("Running task", "Weka Tree")
    for l in range(len(cluster_names)):
        cluster_names[l] = cluster_names[l].split()[0]
    """
    for l in range(len(label_names)):
        if label_names[l][:6] == "class-":
            label_names[l] = label_names[l][6:]
    """
    f1_array = []
    accuracy_array = []
    labels = labels.transpose()
    print("labels transposed")
    print("labels", len(labels), len(labels[0]))
    if limit_entities is False:
        vector_names = dt.import1dArray(vector_names_fn)
        limited_labels = dt.import1dArray(limited_label_fn)
        vectors = np.asarray(dt.match_entities(vectors, limited_labels, vector_names))
    all_y_test = []
    all_predictions = []
    for l in range(len(labels)):
        if balance:
            new_vectors, new_labels = dt.balanceClasses(vectors, labels[l])
        else:
            new_vectors = vectors
            new_labels = labels[l]
        # Select training data with cross-validation
        ac_y_test = []
        ac_y_train = []
        ac_x_train = []
        ac_x_test = []
        ac_y_dev = []
        ac_x_dev = []
        cv_f1 = []
        cv_acc = []
        if cv_splits == 1:
            kf = KFold(n_splits=3, shuffle=False, random_state=None)
        else:
            kf = KFold(n_splits=cv_splits, shuffle=False, random_state=None)
        c = 0
        for train, test in kf.split(new_vectors):
            if split_to_use > -1:
                if c != split_to_use:
                    c += 1
                    continue
            # The first 20% of each training fold is held out for development
            ac_y_test.append(new_labels[test])
            ac_y_train.append(new_labels[train[int(len(train) * 0.2):]])
            ac_x_train.append(new_vectors[train[int(len(train) * 0.2):]])
            ac_x_test.append(new_vectors[test])
            ac_x_dev.append(new_vectors[train[:int(len(train) * 0.2)]])
            ac_y_dev.append(new_labels[train[:int(len(train) * 0.2)]])
            c += 1
            if cv_splits == 1:
                break
        predictions = []
        rules = []
        if development:
            ac_x_test = np.copy(np.asarray(ac_x_dev))
            ac_y_test = np.copy(np.asarray(ac_y_dev))
        train_fn = "../data/" + data_type + "/weka/data/" + filename + "Train.txt"
        test_fn = "../data/" + data_type + "/weka/data/" + filename + "Test.txt"
        for splits in range(len(ac_y_test)):
            # Get the Weka predictions
            dt.writeArff(ac_x_train[splits], [ac_y_train[splits]], [label_names[splits]], train_fn, header=True)
            dt.writeArff(ac_x_test[splits], [ac_y_test[splits]], [label_names[splits]], test_fn, header=True)
            prediction, rule = self.getWekaPredictions(train_fn + label_names[splits] + ".arff",
                                                       test_fn + label_names[splits] + ".arff",
                                                       save_details, pruning)
            predictions.append(prediction)
            rules.append(rule)
        for i in range(len(predictions)):
            if len(predictions) == 1:
                all_y_test.append(ac_y_test[i])
                all_predictions.append(predictions[i])
            f1 = f1_score(ac_y_test[i], predictions[i], average="binary")
            accuracy = accuracy_score(ac_y_test[i], predictions[i])
            cv_f1.append(f1)
            cv_acc.append(accuracy)
            scores = [[label_names[l], "f1", f1, "accuracy", accuracy]]
            print(scores)
            # Export a tree for each label predicted by the classifier
            if save_details:
                data_fn = "../data/" + data_type + "/rules/weka_rules/" + label_names[l] + " " + filename + ".txt"
                class_names = [label_names[l], "NOT " + label_names[l]]
                # self.get_code(clf, cluster_names, class_names, label_names[l] + " " + filename, data_type)
                dt.write1dArray(rules[i].split("\n"), data_fn)
                # Replace the feature indices in the exported dot file with cluster names
                dot_file = dt.import1dArray(data_fn)
                new_dot_file = []
                for line in dot_file:
                    if "->" not in line and "label" in line and '"t ' not in line and '"f ' not in line:
                        line = line.split('"')
                        line[1] = '"' + cluster_names[int(line[1])] + '"'
                        line = "".join(line)
                    new_dot_file.append(line)
                dt.write1dArray(new_dot_file, data_fn)
                graph = pydot.graph_from_dot_file(data_fn)
                graph.write_png("../data/" + data_type + "/rules/weka_images/" + label_names[l] + " " +
                                filename + ".png")
        f1_array.append(np.average(np.asarray(cv_f1)))
        accuracy_array.append(np.average(np.asarray(cv_acc)))
    accuracy_array = np.asarray(accuracy_array)
    accuracy_average = np.average(accuracy_array)
    accuracy_array = accuracy_array.tolist()
    f1_array = np.asarray(f1_array)
    f1_average = np.average(f1_array)
    f1_array = f1_array.tolist()
    micro_average = f1_score(np.asarray(all_y_test), np.asarray(all_predictions), average="micro")
    print("Micro F1", micro_average)
    accuracy_array.append(accuracy_average)
    accuracy_array.append(0.0)
    f1_array.append(f1_average)
    f1_array.append(micro_average)
    scores = [accuracy_array, f1_array]
    dt.write1dArray(accuracy_array, acc_fn)
    dt.write1dArray(f1_array, f1_fn)
    print(csv_fn)
    if dt.fileExists(csv_fn):
        print("File exists, writing to csv")
        try:
            dt.write_to_csv(csv_fn, file_names, scores)
        except PermissionError:
            print("CSV FILE WAS OPEN, WRITING TO ANOTHER FILE")
            dt.write_to_csv(csv_fn[:len(csv_fn) - 4] + str(random.random()) + "FAIL.csv", file_names, scores)
    else:
        print("File does not exist, recreating csv")
        key = []
        for l in label_names:
            key.append(l)
        key.append("AVERAGE")
        key.append("MICRO AVERAGE")
        dt.write_csv(csv_fn, file_names, scores, key)
def main(corpus_fn, output_folder):
    corpus = fetch_20newsgroups(subset='all', shuffle=False, remove=("headers", "footers", "quotes")).data
    tokenized_corpus, text_corpus = spacyTokenizeLowercase(corpus)
    np.save("../data/raw/newsgroups/corpus.npy", tokenized_corpus)
    dt.write1dArray(text_corpus, "../data/raw/newsgroups/corpus_processed.txt")
    # (tail of the Reuters preprocessing loop: mark each document's categories)
    fid, _, cats = line.partition(' ')
    doc_index = fileid_mapping[fid]
    for c in cats.split():
        class_index = cat_names[c]
        class_all[doc_index][class_index] = 1
        new_class_all[doc_index][class_index] = 1
        print(fid, doc_index, c, class_index)
print(class_all.shape)
save_path = "../data/raw/reuters/"
np.save(save_path + "fileid_mapping.npy", fileid_mapping)
np.save(save_path + "category_name_mapping.npy", cat_names)
print("cats", len(np.unique(list(cat_names.keys()))))
dt.write1dArray(list(cat_names.keys()), save_path + "category_names.txt")
names = list(fileid_mapping.keys())
for i in range(len(names)):
    names[i] = "_".join(names[i].split("/"))
dt.write1dArray(names, save_path + "available_entities.txt")
print("names", len(np.unique(names)))
dt.write2dArray(class_all, save_path + "class-all.txt")
dt.write1dArray(docs, save_path + "corpus.txt")
print("docs", len(np.unique(docs)))
unique_docs, index = np.unique(docs, return_index=True)
def removeClass(folder_name):
    names = dt.getFns(folder_name)
    for name in names:
        if name[:12] == "class-class-":
            contents = dt.import1dArray(folder_name + name)
            dt.write1dArray(contents, folder_name + name[6:])
def everythingElse():
    np.random.seed(1337)
    # Get frequencies, PPMIs, classes: everything needed for directions
    skip_top = 0
    lowest_amt = skip_top
    highest_amt = 0
    index_from = 2
    classification = "all"
    bigrams = True
    if bigrams is False:
        (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=highest_amt, skip_top=skip_top,
                                                              index_from=index_from)
    else:
        (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=0, skip_top=0, index_from=index_from)
    train_len = len(x_train)
    test_len = len(y_train)
    vectors = np.concatenate((x_train, x_test), axis=0)
    classes = np.concatenate((y_train, y_test), axis=0)
    # vectors = x_train[:int(len(x_train) * 0.8)]
    # classes = y_train[:int(len(y_train) * 0.8)]
    word_to_id = imdb.get_word_index()
    word_to_id = {k: (v + index_from) for k, v in word_to_id.items()}
    word_to_id["<UNK>"] = 0
    word_to_id["<START>"] = 1
    word_to_id["<OOV>"] = 2
    id_to_word = {value: key for key, value in word_to_id.items()}
    # Recreate the original word sequences from the id sequences
    word_vectors = np.empty(shape=(len(vectors)), dtype=np.object)
    for s in range(len(vectors)):
        word_sentence = []
        for w in range(len(vectors[s])):
            word_sentence.append(id_to_word[vectors[s][w]])
        word_vectors[s] = word_sentence
    import gensim.models.phrases
    phrases = gensim.models.Phrases(word_vectors)
    bigram = gensim.models.phrases.Phraser(phrases)
    phrase_vectors = [bigram[sentence] for sentence in word_vectors]
    from gensim import corpora
    dictionary = corpora.Dictionary(phrase_vectors)
    dictionary.filter_extremes(no_below=highest_amt)
    dfs_list = []
    words = []
    for i in range(len(dictionary.keys())):
        words.append(dictionary[i])
        dfs_list.append(dictionary.dfs[i])
    dt.write1dArray(words, "../data/sentiment/bow/names/" + str(lowest_amt) + "-" + str(highest_amt) + "-" +
                    classification + ".txt")
    dt.write1dArray(dfs_list, "../data/sentiment/bow/frequency/global/" + str(lowest_amt) + "-" +
                    str(highest_amt) + "-" + classification + ".txt")
    corpus = [dictionary.doc2bow(text) for text in phrase_vectors]
    all_fn = "../data/sentiment/bow/frequency/phrases/class-all-" + str(lowest_amt) + "-" + \
             str(highest_amt) + "-" + classification
    all_fn_binary = "../data/sentiment/bow/binary/phrases/class-all-" + str(lowest_amt) + "-" + \
                    str(highest_amt) + "-" + classification
    import gensim.matutils
    corpus = gensim.matutils.corpus2csc(corpus)
    sp.save_npz(all_fn, corpus)
    """
    # Disabled: PPMI conversion of the saved term-document matrix
    # all_fn = "../data/sentiment/bow/frequency/phrases/class-all-"+str(lowest_amt)+"-"+str(highest_amt)+"-" + classification
    # corpus = sp.load_npz(all_fn + ".npz")
    print("saving")
    ppmi = mt.convertPPMI(corpus)
    ppmi_sparse = sp.csr_matrix(ppmi)
    ppmi_fn = "../data/sentiment/bow/ppmi/class-all-"+str(lowest_amt)+"-"+str(highest_amt)+"-" + classification
    sp.save_npz(ppmi_fn, ppmi_sparse)
    """
    """
    # Disabled: rebuild the ordered word list from the id mapping
    for key, val in id_to_word.items():
        if val == "that":
            print(key)
    ids = np.asarray(list(id_to_word.keys()), dtype=np.int32)
    words = np.asarray(list(id_to_word.values()), dtype="str")
    sorted_ids = np.argsort(ids)
    complete_word_list = words[sorted_ids]
    word_list = complete_word_list[skip_top+3:highest_amt]
    new_word_list = []
    new_word_list.append("<UNK>")
    new_word_list.append("<START>")
    new_word_list.append("<OOV>")
    for i in range(skip_top):
        new_word_list.append("<OOV>")
    for w in word_list:
        new_word_list.append(w)
    """
    """
    # Disabled: sanity-check the id-to-word mappings on the training data
    for x in x_train:
        run = False
        for n in range(len(x)):
            if x[n] > highest_amt - 1000:
                run = True
        if run:
            try:
                for id in x_train[0]:
                    print(id, end=' ')
                print("")
                for id in x_train[0]:
                    print(complete_word_list[id], end=' ')
                print("")
                for id in x_train[0]:
                    print(new_word_list[id], end=' ')
                print("")
            except KeyError:
print("fail") break """ """ import codecs decoding_table = ( '\x00' # 0x00 -> NULL '\x01' # 0x01 -> START OF HEADING '\x02' # 0x02 -> START OF TEXT '\x03' # 0x03 -> END OF TEXT '\x04' # 0x04 -> END OF TRANSMISSION '\x05' # 0x05 -> ENQUIRY '\x06' # 0x06 -> ACKNOWLEDGE '\x07' # 0x07 -> BELL '\x08' # 0x08 -> BACKSPACE '\t' # 0x09 -> HORIZONTAL TABULATION '\n' # 0x0A -> LINE FEED '\x0b' # 0x0B -> VERTICAL TABULATION '\x0c' # 0x0C -> FORM FEED '\r' # 0x0D -> CARRIAGE RETURN '\x0e' # 0x0E -> SHIFT OUT '\x0f' # 0x0F -> SHIFT IN '\x10' # 0x10 -> DATA LINK ESCAPE '\x11' # 0x11 -> DEVICE CONTROL ONE '\x12' # 0x12 -> DEVICE CONTROL TWO '\x13' # 0x13 -> DEVICE CONTROL THREE '\x14' # 0x14 -> DEVICE CONTROL FOUR '\x15' # 0x15 -> NEGATIVE ACKNOWLEDGE '\x16' # 0x16 -> SYNCHRONOUS IDLE '\x17' # 0x17 -> END OF TRANSMISSION BLOCK '\x18' # 0x18 -> CANCEL '\x19' # 0x19 -> END OF MEDIUM '\x1a' # 0x1A -> SUBSTITUTE '\x1b' # 0x1B -> ESCAPE '\x1c' # 0x1C -> FILE SEPARATOR '\x1d' # 0x1D -> GROUP SEPARATOR '\x1e' # 0x1E -> RECORD SEPARATOR '\x1f' # 0x1F -> UNIT SEPARATOR ' ' # 0x20 -> SPACE '!' # 0x21 -> EXCLAMATION MARK '"' # 0x22 -> QUOTATION MARK '#' # 0x23 -> NUMBER SIGN '$' # 0x24 -> DOLLAR SIGN '%' # 0x25 -> PERCENT SIGN '&' # 0x26 -> AMPERSAND "'" # 0x27 -> APOSTROPHE '(' # 0x28 -> LEFT PARENTHESIS ')' # 0x29 -> RIGHT PARENTHESIS '*' # 0x2A -> ASTERISK '+' # 0x2B -> PLUS SIGN ',' # 0x2C -> COMMA '-' # 0x2D -> HYPHEN-MINUS '.' # 0x2E -> FULL STOP '/' # 0x2F -> SOLIDUS '0' # 0x30 -> DIGIT ZERO '1' # 0x31 -> DIGIT ONE '2' # 0x32 -> DIGIT TWO '3' # 0x33 -> DIGIT THREE '4' # 0x34 -> DIGIT FOUR '5' # 0x35 -> DIGIT FIVE '6' # 0x36 -> DIGIT SIX '7' # 0x37 -> DIGIT SEVEN '8' # 0x38 -> DIGIT EIGHT '9' # 0x39 -> DIGIT NINE ':' # 0x3A -> COLON ';' # 0x3B -> SEMICOLON '<' # 0x3C -> LESS-THAN SIGN '=' # 0x3D -> EQUALS SIGN '>' # 0x3E -> GREATER-THAN SIGN '?' 
# 0x3F -> QUESTION MARK '@' # 0x40 -> COMMERCIAL AT 'A' # 0x41 -> LATIN CAPITAL LETTER A 'B' # 0x42 -> LATIN CAPITAL LETTER B 'C' # 0x43 -> LATIN CAPITAL LETTER C 'D' # 0x44 -> LATIN CAPITAL LETTER D 'E' # 0x45 -> LATIN CAPITAL LETTER E 'F' # 0x46 -> LATIN CAPITAL LETTER F 'G' # 0x47 -> LATIN CAPITAL LETTER G 'H' # 0x48 -> LATIN CAPITAL LETTER H 'I' # 0x49 -> LATIN CAPITAL LETTER I 'J' # 0x4A -> LATIN CAPITAL LETTER J 'K' # 0x4B -> LATIN CAPITAL LETTER K 'L' # 0x4C -> LATIN CAPITAL LETTER L 'M' # 0x4D -> LATIN CAPITAL LETTER M 'N' # 0x4E -> LATIN CAPITAL LETTER N 'O' # 0x4F -> LATIN CAPITAL LETTER O 'P' # 0x50 -> LATIN CAPITAL LETTER P 'Q' # 0x51 -> LATIN CAPITAL LETTER Q 'R' # 0x52 -> LATIN CAPITAL LETTER R 'S' # 0x53 -> LATIN CAPITAL LETTER S 'T' # 0x54 -> LATIN CAPITAL LETTER T 'U' # 0x55 -> LATIN CAPITAL LETTER U 'V' # 0x56 -> LATIN CAPITAL LETTER V 'W' # 0x57 -> LATIN CAPITAL LETTER W 'X' # 0x58 -> LATIN CAPITAL LETTER X 'Y' # 0x59 -> LATIN CAPITAL LETTER Y 'Z' # 0x5A -> LATIN CAPITAL LETTER Z '[' # 0x5B -> LEFT SQUARE BRACKET '\\' # 0x5C -> REVERSE SOLIDUS ']' # 0x5D -> RIGHT SQUARE BRACKET '^' # 0x5E -> CIRCUMFLEX ACCENT '_' # 0x5F -> LOW LINE '`' # 0x60 -> GRAVE ACCENT 'a' # 0x61 -> LATIN SMALL LETTER A 'b' # 0x62 -> LATIN SMALL LETTER B 'c' # 0x63 -> LATIN SMALL LETTER C 'd' # 0x64 -> LATIN SMALL LETTER D 'e' # 0x65 -> LATIN SMALL LETTER E 'f' # 0x66 -> LATIN SMALL LETTER F 'g' # 0x67 -> LATIN SMALL LETTER G 'h' # 0x68 -> LATIN SMALL LETTER H 'i' # 0x69 -> LATIN SMALL LETTER I 'j' # 0x6A -> LATIN SMALL LETTER J 'k' # 0x6B -> LATIN SMALL LETTER K 'l' # 0x6C -> LATIN SMALL LETTER L 'm' # 0x6D -> LATIN SMALL LETTER M 'n' # 0x6E -> LATIN SMALL LETTER N 'o' # 0x6F -> LATIN SMALL LETTER O 'p' # 0x70 -> LATIN SMALL LETTER P 'q' # 0x71 -> LATIN SMALL LETTER Q 'r' # 0x72 -> LATIN SMALL LETTER R 's' # 0x73 -> LATIN SMALL LETTER S 't' # 0x74 -> LATIN SMALL LETTER T 'u' # 0x75 -> LATIN SMALL LETTER U 'v' # 0x76 -> LATIN SMALL LETTER V 'w' # 0x77 -> LATIN SMALL LETTER W 'x' # 0x78 -> LATIN SMALL LETTER X 'y' # 0x79 -> LATIN SMALL LETTER Y 'z' # 0x7A -> LATIN SMALL LETTER Z '{' # 0x7B -> LEFT CURLY BRACKET '|' # 0x7C -> VERTICAL LINE '}' # 0x7D -> RIGHT CURLY BRACKET '~' # 0x7E -> TILDE '\x7f' # 0x7F -> DELETE '\u20ac' # 0x80 -> EURO SIGN '\ufffe' # 0x81 -> UNDEFINED '\u201a' # 0x82 -> SINGLE LOW-9 QUOTATION MARK '\u0192' # 0x83 -> LATIN SMALL LETTER F WITH HOOK '\u201e' # 0x84 -> DOUBLE LOW-9 QUOTATION MARK '\u2026' # 0x85 -> HORIZONTAL ELLIPSIS '\u2020' # 0x86 -> DAGGER '\u2021' # 0x87 -> DOUBLE DAGGER '\u02c6' # 0x88 -> MODIFIER LETTER CIRCUMFLEX ACCENT '\u2030' # 0x89 -> PER MILLE SIGN '\u0160' # 0x8A -> LATIN CAPITAL LETTER S WITH CARON '\u2039' # 0x8B -> SINGLE LEFT-POINTING ANGLE QUOTATION MARK '\u0152' # 0x8C -> LATIN CAPITAL LIGATURE OE '\ufffe' # 0x8D -> UNDEFINED '\u017d' # 0x8E -> LATIN CAPITAL LETTER Z WITH CARON '\ufffe' # 0x8F -> UNDEFINED '\ufffe' # 0x90 -> UNDEFINED '\u2018' # 0x91 -> LEFT SINGLE QUOTATION MARK '\u2019' # 0x92 -> RIGHT SINGLE QUOTATION MARK '\u201c' # 0x93 -> LEFT DOUBLE QUOTATION MARK '\u201d' # 0x94 -> RIGHT DOUBLE QUOTATION MARK '\u2022' # 0x95 -> BULLET '\u2013' # 0x96 -> EN DASH '\u2014' # 0x97 -> EM DASH '\u02dc' # 0x98 -> SMALL TILDE '\u2122' # 0x99 -> TRADE MARK SIGN '\u0161' # 0x9A -> LATIN SMALL LETTER S WITH CARON '\u203a' # 0x9B -> SINGLE RIGHT-POINTING ANGLE QUOTATION MARK '\u0153' # 0x9C -> LATIN SMALL LIGATURE OE '\ufffe' # 0x9D -> UNDEFINED '\u017e' # 0x9E -> LATIN SMALL LETTER Z WITH CARON '\u0178' # 0x9F -> LATIN CAPITAL LETTER Y WITH 
DIAERESIS '\xa0' # 0xA0 -> NO-BREAK SPACE '\xa1' # 0xA1 -> INVERTED EXCLAMATION MARK '\xa2' # 0xA2 -> CENT SIGN '\xa3' # 0xA3 -> POUND SIGN '\xa4' # 0xA4 -> CURRENCY SIGN '\xa5' # 0xA5 -> YEN SIGN '\xa6' # 0xA6 -> BROKEN BAR '\xa7' # 0xA7 -> SECTION SIGN '\xa8' # 0xA8 -> DIAERESIS '\xa9' # 0xA9 -> COPYRIGHT SIGN '\xaa' # 0xAA -> FEMININE ORDINAL INDICATOR '\xab' # 0xAB -> LEFT-POINTING DOUBLE ANGLE QUOTATION MARK '\xac' # 0xAC -> NOT SIGN '\xad' # 0xAD -> SOFT HYPHEN '\xae' # 0xAE -> REGISTEREvectorsD SIGN '\xaf' # 0xAF -> MACRON '\xb0' # 0xB0 -> DEGREE SIGN '\xb1' # 0xB1 -> PLUS-MINUS SIGN '\xb2' # 0xB2 -> SUPERSCRIPT TWO '\xb3' # 0xB3 -> SUPERSCRIPT THREE '\xb4' # 0xB4 -> ACUTE ACCENT '\xb5' # 0xB5 -> MICRO SIGN '\xb6' # 0xB6 -> PILCROW SIGN '\xb7' # 0xB7 -> MIDDLE DOT '\xb8' # 0xB8 -> CEDILLA '\xb9' # 0xB9 -> SUPERSCRIPT ONE '\xba' # 0xBA -> MASCULINE ORDINAL INDICATOR '\xbb' # 0xBB -> RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK '\xbc' # 0xBC -> VULGAR FRACTION ONE QUARTER '\xbd' # 0xBD -> VULGAR FRACTION ONE HALF '\xbe' # 0xBE -> VULGAR FRACTION THREE QUARTERS '\xbf' # 0xBF -> INVERTED QUESTION MARK '\xc0' # 0xC0 -> LATIN CAPITAL LETTER A WITH GRAVE '\xc1' # 0xC1 -> LATIN CAPITAL LETTER A WITH ACUTE '\xc2' # 0xC2 -> LATIN CAPITAL LETTER A WITH CIRCUMFLEX '\xc3' # 0xC3 -> LATIN CAPITAL LETTER A WITH TILDE '\xc4' # 0xC4 -> LATIN CAPITAL LETTER A WITH DIAERESIS '\xc5' # 0xC5 -> LATIN CAPITAL LETTER A WITH RING ABOVE '\xc6' # 0xC6 -> LATIN CAPITAL LETTER AE '\xc7' # 0xC7 -> LATIN CAPITAL LETTER C WITH CEDILLA '\xc8' # 0xC8 -> LATIN CAPITAL LETTER E WITH GRAVE '\xc9' # 0xC9 -> LATIN CAPITAL LETTER E WITH ACUTE '\xca' # 0xCA -> LATIN CAPITAL LETTER E WITH CIRCUMFLEX '\xcb' # 0xCB -> LATIN CAPITAL LETTER E WITH DIAERESIS '\xcc' # 0xCC -> LATIN CAPITAL LETTER I WITH GRAVE '\xcd' # 0xCD -> LATIN CAPITAL LETTER I WITH ACUTE '\xce' # 0xCE -> LATIN CAPITAL LETTER I WITH CIRCUMFLEX '\xcf' # 0xCF -> LATIN CAPITAL LETTER I WITH DIAERESIS '\xd0' # 0xD0 -> LATIN CAPITAL LETTER ETH '\xd1' # 0xD1 -> LATIN CAPITAL LETTER N WITH TILDE '\xd2' # 0xD2 -> LATIN CAPITAL LETTER O WITH GRAVE '\xd3' # 0xD3 -> LATIN CAPITAL LETTER O WITH ACUTE '\xd4' # 0xD4 -> LATIN CAPITAL LETTER O WITH CIRCUMFLEX '\xd5' # 0xD5 -> LATIN CAPITAL LETTER O WITH TILDE '\xd6' # 0xD6 -> LATIN CAPITAL LETTER O WITH DIAERESIS '\xd7' # 0xD7 -> MULTIPLICATION SIGN '\xd8' # 0xD8 -> LATIN CAPITAL LETTER O WITH STROKE '\xd9' # 0xD9 -> LATIN CAPITAL LETTER U WITH GRAVE '\xda' # 0xDA -> LATIN CAPITAL LETTER U WITH ACUTE '\xdb' # 0xDB -> LATIN CAPITAL LETTER U WITH CIRCUMFLEX '\xdc' # 0xDC -> LATIN CAPITAL LETTER U WITH DIAERESIS '\xdd' # 0xDD -> LATIN CAPITAL LETTER Y WITH ACUTE '\xde' # 0xDE -> LATIN CAPITAL LETTER THORN '\xdf' # 0xDF -> LATIN SMALL LETTER SHARP S '\xe0' # 0xE0 -> LATIN SMALL LETTER A WITH GRAVE '\xe1' # 0xE1 -> LATIN SMALL LETTER A WITH ACUTE '\xe2' # 0xE2 -> LATIN SMALL LETTER A WITH CIRCUMFLEX '\xe3' # 0xE3 -> LATIN SMALL LETTER A WITH TILDE '\xe4' # 0xE4 -> LATIN SMALL LETTER A WITH DIAERESIS '\xe5' # 0xE5 -> LATIN SMALL LETTER A WITH RING ABOVE '\xe6' # 0xE6 -> LATIN SMALL LETTER AE '\xe7' # 0xE7 -> LATIN SMALL LETTER C WITH CEDILLA '\xe8' # 0xE8 -> LATIN SMALL LETTER E WITH GRAVE '\xe9' # 0xE9 -> LATIN SMALL LETTER E WITH ACUTE '\xea' # 0xEA -> LATIN SMALL LETTER E WITH CIRCUMFLEX '\xeb' # 0xEB -> LATIN SMALL LETTER E WITH DIAERESIS '\xec' # 0xEC -> LATIN SMALL LETTER I WITH GRAVE '\xed' # 0xED -> LATIN SMALL LETTER I WITH ACUTE '\xee' # 0xEE -> LATIN SMALL LETTER I WITH CIRCUMFLEX '\xef' # 0xEF -> LATIN SMALL LETTER I 
WITH DIAERESIS '\xf0' # 0xF0 -> LATIN SMALL LETTER ETH '\xf1' # 0xF1 -> LATIN SMALL LETTER N WITH TILDE '\xf2' # 0xF2 -> LATIN SMALL LETTER O WITH GRAVE '\xf3' # 0xF3 -> LATIN SMALL LETTER O WITH ACUTE '\xf4' # 0xF4 -> LATIN SMALL LETTER O WITH CIRCUMFLEX '\xf5' # 0xF5 -> LATIN SMALL LETTER O WITH TILDE '\xf6' # 0xF6 -> LATIN SMALL LETTER O WITH DIAERESIS '\xf7' # 0xF7 -> DIVISION SIGN '\xf8' # 0xF8 -> LATIN SMALL LETTER O WITH STROKE '\xf9' # 0xF9 -> LATIN SMALL LETTER U WITH GRAVE '\xfa' # 0xFA -> LATIN SMALL LETTER U WITH ACUTE '\xfb' # 0xFB -> LATIN SMALL LETTER U WITH CIRCUMFLEX '\xfc' # 0xFC -> LATIN SMALL LETTER U WITH DIAERESIS '\xfd' # 0xFD -> LATIN SMALL LETTER Y WITH ACUTE '\xfe' # 0xFE -> LATIN SMALL LETTER THORN '\xff' # 0xFF -> LATIN SMALL LETTER Y WITH DIAERESIS ) import collections encoding_table=codecs.charmap_build(decoding_table) mapped_ids = collections.OrderedDict() del_ind = [] amt_of_encs = 0 for w in range(len(word_list)): try: for char in word_list[w]: codecs.charmap_encode(char,"strict", encoding_table) except UnicodeEncodeError: amt_of_encs += 1 del_ind.append(w) print(word_list[w], w) mapped_ids[skip_top+3+w] = w - amt_of_encs word_list = np.delete(word_list, del_ind) print("----------------------------------------") dt.write1dArray(word_list, "../data/sentiment/bow/names/" + str(lowest_amt) + "-" + str(highest_amt) + "-" + classification + ".txt") all_fn = "../data/sentiment/bow/frequency/phrases/class-all-"+str(lowest_amt)+"-"+str(highest_amt)+"-" + classification all_fn_binary = "../data/sentiment/bow/binary/phrases/class-all-"+str(lowest_amt)+"-"+str(highest_amt)+"-" + classification tf = np.zeros(shape=(len(vectors), len(word_list)), dtype=np.int32) tf_binary = np.zeros(shape=(len(vectors), len(word_list)), dtype=np.int32) for ds in range(len(vectors)): # d for document sequence for wi in range(len(vectors[ds])): # every word id in the sequence if vectors[ds][wi] >= skip_top+ 3: new_id = mapped_ids[vectors[ds][wi]] print(ds, new_id) tf[ds][new_id] += 1 tf_binary[ds][new_id] = 1 print("transposing") tf = np.asarray(tf, dtype="int").transpose() tf_binary = np.asarray(tf_binary, dtype="int").transpose() tf_sparse = sp.csr_matrix(tf) tf_binary_sparse = sp.csr_matrix(tf_binary) sp.save_npz(all_fn_binary, tf_binary_sparse) sp.save_npz(all_fn, tf_sparse) print("saving") #mt.printIndividualFromAll("sentiment", "frequency/phrases", lowest_amt, highest_amt, classification, all_fn=all_fn, names_array=word_list) #mt.printIndividualFromAll("sentiment", "binary/phrases", lowest_amt, highest_amt, classification, all_fn=all_fn_binary, names_array=word_list) ppmi_fn = "../data/sentiment/bow/ppmi/class-all-"+str(lowest_amt)+"-"+str(highest_amt)+"-" + classification #if dt.fileExists(ppmi_fn) is False: ppmi = mt.convertPPMI( tf_sparse) ppmi_sparse = sp.csr_matrix(ppmi) sp.save_npz(ppmi_fn, ppmi_sparse) dt.write2dArray(ppmi, ppmi_fn) #mt.printIndividualFromAll("sentiment", "ppmi", lowest_amt, highest_amt, classification, all_fn=all_fn, names_array=word_list) dt.write2dArray(tf_binary, all_fn_binary) dt.write2dArray(tf, all_fn) """ print("1") classes = np.asarray(classes, dtype=np.int32) print(2) print(3) print(4) names = ["sentiment"] dt.write1dArray(names, "../data/sentiment/classify/sentiment/names.txt") dt.write1dArray(classes, "../data/sentiment/classify/sentiment/class-" + "sentiment") dt.write1dArray(classes,"../data/sentiment/classify/sentiment/class-all") dt.write1dArray(list(range(len(classes))), "../data/sentiment/classify/sentiment/available_entities.txt")
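# mt.convertPPMI is called above but defined elsewhere. A standard positive PMI
# computation over a term-document count matrix for reference: PPMI(t, d) =
# max(0, log(P(t, d) / (P(t) P(d)))). The dense conversion and the orientation
# (terms as rows, matching the transposed tf matrix above) are assumptions.
def convertPPMI(counts):
    counts = np.asarray(counts.todense(), dtype=np.float64)  # terms x documents
    total = counts.sum()
    row_p = counts.sum(axis=1) / total   # P(term)
    col_p = counts.sum(axis=0) / total   # P(document)
    joint = counts / total               # P(term, document)
    with np.errstate(divide="ignore", invalid="ignore"):
        pmi = np.log(joint / np.outer(row_p, col_p))
    pmi[~np.isfinite(pmi)] = 0.0         # zero out log(0) and 0/0 cells
    return np.maximum(pmi, 0.0)          # positive PMI: clamp negatives to zero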
def parseTree(tree_fn, output_fn, entity_names_fn):
    data_type = "placetypes"
    class_name = "opencyc"
    entity_names = dt.import1dArray(entity_names_fn)
    with open(tree_fn, "r") as infile:
        tree = [line for line in infile]
    tree = tree[1:]
    indexes_to_delete = []
    for l in range(len(tree)):
        tree[l] = re.sub(r'\s\*', ' ', tree[l])
        if "DELETE" in tree[l]:
            indexes_to_delete.append(l)
    tree = np.delete(tree, indexes_to_delete)
    entities_classes = {}
    for l in range(len(tree)):
        removed_asterisk = re.sub(r'\*', ' ', tree[l])
        stripped = removed_asterisk.strip()
        entities_classes[stripped] = []
    classes = []
    current_tabs = 0
    current_tabs_index = 0
    current_tab_class = []
    class_names = []
    next_index = 0
    for l in range(len(tree) - 1):
        removed_asterisk = re.sub(r'\*', ' ', tree[l])
        entity = removed_asterisk.strip()
        tabs = len(tree[l]) - len(tree[l].strip())
        next_tabs = len(tree[l + 1]) - len(tree[l + 1].strip())
        print("TRY", entity, tabs, next_tabs)
        # If the tree has a subclass, collect everything indented below it
        if next_tabs > tabs and tabs <= 4:
            print("START", entity, tabs, next_tabs)
            for j in range(l + 1, len(tree)):
                inner_tabs = len(tree[j]) - len(tree[j].strip())
                removed_asterisk = re.sub(r'\*', ' ', tree[j])
                inner_entity = removed_asterisk.strip()
                print("ADD", inner_entity)
                if inner_tabs <= tabs:
                    print("END", inner_tabs, tabs)
                    break
                else:
                    entities_classes[entity].append(inner_entity)
                    print("found", inner_entity, "added to", entity)
    found_entities = []
    found_arrays = []
    class_names = []
    for key, value in list(entities_classes.items()):
        if len(value) < 30:
            del entities_classes[key]
            continue
        """
        # Removing entities that aren't in a list
        found = False
        for e in entity_names:
            if key == e:
                found = True
        if not found:
            del entities_classes[key]
            continue
        """
        for v in value:
            found_entities.append(v)
        found_arrays.append(value)
        class_names.append(key)
    found_entities = np.unique(np.asarray(found_entities))
    dt.write1dArray(found_entities, "../data/" + data_type + "/classify/" + class_name + "/available_entities.txt")
    # Sort keys and values
    index = np.argsort(class_names)
    sorted_class_names = []
    sorted_value_names = []
    for i in index:
        sorted_class_names.append(class_names[i])
        sorted_value_names.append(found_arrays[i])
    value_indexes = []
    # Convert values to indexes
    for v in range(len(sorted_value_names)):
        value_index = []
        for g in range(len(sorted_value_names[v])):
            for e in range(len(found_entities)):
                if sorted_value_names[v][g] == found_entities[e]:
                    value_index.append(e)
        value_indexes.append(value_index)
    # Binary matrix: one row per entity, one column per class
    matrix = np.asarray([[0] * len(entities_classes)] * len(found_entities))
    for c in range(len(sorted_class_names)):
        print(c)
        print("-------------------")
        for v in value_indexes[c]:
            print(v)
            matrix[v, c] = 1
        dt.write1dArray(matrix[:, c], "../data/placetypes/classify/opencyc/class-" + sorted_class_names[c])
    matrix = np.asarray(matrix)
    dt.write2dArray(matrix, "../data/placetypes/classify/opencyc/class-all")
def __init__(self, vector_path, class_path, property_names_fn, file_name, svm_type, training_size=10000,
             lowest_count=200, highest_count=21470000, get_kappa=True, get_f1=True, single_class=True,
             data_type="movies", getting_directions=True, threads=1, chunk_amt=0, chunk_id=0,
             rewrite_files=False, classification="all", loc="../data/", logistic_regression=False,
             sparse_array_fn=None, only_these_fn=None):
    self.get_kappa = get_kappa  # Was hard-coded to True, ignoring the parameter
    self.get_f1 = get_f1
    self.data_type = data_type
    self.classification = classification
    self.lowest_amt = lowest_count
    self.higher_amt = highest_count

    if chunk_amt > 0:
        file_name = file_name + " CID" + str(chunk_id) + " CAMT" + str(chunk_amt)

    directions_fn = loc + data_type + "/svm/directions/" + file_name + ".txt"
    ktau_scores_fn = loc + data_type + "/svm/f1/" + file_name + ".txt"
    kappa_fn = loc + data_type + "/svm/kappa/" + file_name + ".txt"
    acc_fn = loc + data_type + "/svm/acc/" + file_name + ".txt"
    TP_fn = loc + data_type + "/svm/stats/TP " + file_name + ".txt"
    FP_fn = loc + data_type + "/svm/stats/FP " + file_name + ".txt"
    TN_fn = loc + data_type + "/svm/stats/TN " + file_name + ".txt"
    FN_fn = loc + data_type + "/svm/stats/FN " + file_name + ".txt"

    all_fns = [directions_fn, kappa_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", "getSVMResults")
        return
    else:
        print("Running task", "getSVMResults")

    y_train = 0
    y_test = 0
    vectors = np.asarray(dt.import2dArray(vector_path))
    print("imported vectors")
    if not getting_directions:
        classes = np.asarray(dt.import2dArray(class_path))
        print("imported classes")
    property_names = dt.import1dArray(property_names_fn)
    print("imported property names")

    # When chunking, take this worker's slice of the property names
    if chunk_amt > 0:
        if chunk_id == chunk_amt - 1:
            chunk = int(len(property_names) / chunk_amt)
            multiply = chunk_amt - 1
            property_names = property_names[chunk * multiply:]
        else:
            property_names = dt.chunks(property_names, int((len(property_names) / chunk_amt)))[chunk_id]

    if sparse_array_fn is not None:
        sparse_array = dt.import2dArray(sparse_array_fn)
    else:
        sparse_array = None

    if sparse_array is not None:
        for s in range(len(sparse_array)):
            if len(np.nonzero(sparse_array[s])[0]) <= 1:
                print("WILL FAIL", s, len(np.nonzero(sparse_array[s])[0]))
            else:
                print(len(np.nonzero(sparse_array[s])[0]))

    if not getting_directions:
        x_train, x_test, y_train, y_test = train_test_split(vectors, classes, test_size=0.3, random_state=0)
    else:
        x_train = vectors
        x_test = vectors

    if get_f1:
        y_train = y_train.transpose()
        y_test = y_test.transpose()
        print("transposed")

    self.x_train = x_train
    self.x_test = x_test
    self.y_train = y_train
    self.y_test = y_test

    if only_these_fn is not None:
        only_these = dt.import1dArray(only_these_fn, "s")
        inds = []
        for s in range(len(property_names)):
            for o in only_these:
                if property_names[s] == o:
                    inds.append(s)
                    break
        sparse_array = sparse_array[inds]
        property_names = property_names[inds]

    if self.get_f1 is False:
        print("running svms")
        kappa_scores, directions, f1_scores, property_names, accs, TPs, FPs, TNs, FNs = self.runAllSVMs(
            y_test, y_train, property_names, file_name, svm_type, getting_directions, threads,
            logistic_regression, sparse_array)

        dt.write1dArray(kappa_scores, kappa_fn)
        dt.write2dArray(directions, directions_fn)
        dt.write1dArray(f1_scores, ktau_scores_fn)
        dt.write1dArray(accs, acc_fn)
        dt.write1dArray(TPs, TP_fn)
        dt.write1dArray(FPs, FP_fn)
        dt.write1dArray(TNs, TN_fn)
        dt.write1dArray(FNs, FN_fn)
        dt.write1dArray(property_names, property_names_fn + file_name + ".txt")
    else:
        final_f1 = []
        final_acc = []
        for y in range(len(y_train)):
            f1, acc = self.runClassifySVM(y_test[y], y_train[y])
            print(f1, acc)
            final_f1.append(f1)
            final_acc.append(acc)
        dt.write1dArray(final_f1, ktau_scores_fn)
        dt.write1dArray(final_acc, acc_fn)
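# --- Illustrative sketch (not part of the original file) ---
# The constructor above does all of the work, so running the task is just
# instantiation. The class name (SVMTasks), paths and file name below are
# hypothetical placeholders, not names from this repository:
# SVMTasks(vector_path="../data/movies/nnet/spaces/example_space.txt",
#          class_path="../data/movies/classify/genres/class-all",
#          property_names_fn="../data/movies/bow/names/example_names.txt",
#          file_name="example_space-genres", svm_type="svm",
#          data_type="movies", getting_directions=True, threads=4)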
def main(data_type, output_folder, grams, no_below, no_above, bowmin):
    if data_type == "newsgroups":
        newsgroups = fetch_20newsgroups(subset='all', shuffle=False, remove=("headers", "footers", "quotes"))
        corpus = newsgroups.data
        classes = newsgroups.target
        encoding_type = "utf8"
    elif data_type == "sentiment":
        (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=0, skip_top=0, index_from=0, seed=113)
        corpus = np.concatenate((x_train, x_test), axis=0)
        classes = np.concatenate((y_train, y_test), axis=0)
        corpus = makeCorpusFromIds(corpus, imdb.get_word_index())
        encoding_type = "utf8"
    else:
        corpus = dt.import1dArray(output_folder + "duplicate_removed_docs.txt")
        classes = dt.import2dArray(output_folder + "duplicate_removed_classes.txt", "i")
        encoding_type = "utf8"

    file_name = "simple_numeric"

    # Tokenize, build the vocabulary and the (filtered) bag-of-words
    processed_corpus = preprocess(corpus)
    tokenized_corpus = naiveTokenizer(processed_corpus)
    vocab, dct, id2token = getVocab(tokenized_corpus)
    processed_corpus, tokenized_corpus, remove_ind, classes = removeEmpty(processed_corpus, tokenized_corpus, classes)
    bow, bow_vocab = doc2bow(tokenized_corpus, dct, bowmin)
    print(bowmin, len(list(bow_vocab.keys())), "|||", bow.shape)
    filtered_bow, word_list, filtered_vocab = filterBow(tokenized_corpus, dct, no_below, no_above)
    tokenized_ids = tokensToIds(tokenized_corpus, vocab)

    print(output_folder + file_name + "_remove.npy")
    np.save(output_folder + file_name + "_remove.npy", remove_ind)
    np.save(output_folder + file_name + "_corpus.npy", tokenized_corpus)
    np.save(output_folder + file_name + "_tokenized_corpus.npy", tokenized_ids)
    np.save(output_folder + file_name + "_vocab " + str(bowmin) + ".npy", bow_vocab)
    np.save(output_folder + file_name + "_filtered_vocab.npy", filtered_vocab)
    dt.write1dArray(processed_corpus, output_folder + file_name + "_corpus_processed.txt", encoding=encoding_type)
    np.save(output_folder + file_name + "_classes.npy", classes)
    if data_type != "reuters":
        np.save(output_folder + file_name + "_classes_categorical.npy", to_categorical(classes))
    sp.save_npz(output_folder + file_name + ".npz", bow)
    dt.write1dArray(word_list, output_folder + file_name + "_words.txt", encoding=encoding_type)
    dt.write1dArray(list(bow_vocab.keys()), output_folder + file_name + "_all_words_2.txt", encoding=encoding_type)

    """
    if grams > 0:
        for i in range(2, grams):  # Up to 5-length grams
            processed_corpus, tokenized_corpus = ngrams(tokenized_corpus)
            vocab, dct, id2token = getVocab(tokenized_corpus)
            bow = doc2bow(tokenized_corpus, dct, 100, 10)
            tokenized_ids = tokensToIds(tokenized_corpus, vocab)
            np.save(output_folder + file_name + "_corpus " + str(i) + "-gram" + ".npy", tokenized_corpus)
            np.save(output_folder + file_name + "_tokenized_corpus " + str(i) + "-gram" + ".npy", tokenized_ids)
            np.save(output_folder + file_name + "_vocab " + str(i) + "-gram" + ".npy", vocab)
            dt.write1dArray(processed_corpus, output_folder + file_name + "_corpus_processed " + str(i) + "-gram" + ".txt")
            sp.save_npz(output_folder + file_name + "_bow " + str(i) + "-gram" + ".npz", bow)
            dt.write1dArray(word_list, output_folder + file_name + "_words.txt")
    """

    file_name += "_stopwords"

    filtered_ppmi_fn = "../data/" + data_type + "/bow/ppmi/" + file_name + "_ppmi " + str(no_below) + "-" + str(no_above) + "-all.npz"
    ppmi_fn = "../data/" + data_type + "/bow/ppmi/" + file_name + "_ppmi " + str(bowmin) + "-all.npz"
    bow_fn = "../data/" + data_type + "/bow/frequency/phrases/" + file_name + "_bow " + str(bowmin) + "-all.npz"
    filtered_bow_fn = "../data/" + data_type + "/bow/frequency/phrases/" + file_name + "_bow " + str(no_below) + "-" + str(no_above) + "-all.npz"

    # Re-initialize so that we don't start with an already filtered corpus
    tokenized_corpus, processed_corpus = removeStopWords(tokenized_corpus)
    processed_corpus, tokenized_corpus, remove_ind, classes = removeEmpty(processed_corpus, tokenized_corpus, classes)
    vocab, dct, id2token = getVocab(tokenized_corpus)
    bow, bow_vocab = doc2bow(tokenized_corpus, dct, bowmin)
    print(bowmin, len(list(bow_vocab.keys())), "|||", bow.shape)
    filtered_bow, word_list, filtered_vocab = filterBow(tokenized_corpus, dct, no_below, no_above)
    tokenized_ids = tokensToIds(tokenized_corpus, vocab)

    print(output_folder + file_name + "_remove.npy")
    print(output_folder + file_name + "_corpus.npy")
    print(output_folder + file_name + "_tokenized_corpus.npy")
    print(output_folder + file_name + "_id2token.npy")
    print(output_folder + file_name + "_vocab " + str(bowmin) + ".npy")
    print(output_folder + file_name + "_corpus_processed.txt")
    print(output_folder + file_name + "_classes.npy")
    print(output_folder + file_name + "_classes_categorical.npy")

    np.save(output_folder + file_name + "_id2token.npy", id2token)  # Was missing the underscore printed above
    np.save(output_folder + file_name + "_remove.npy", remove_ind)
    np.save(output_folder + file_name + "_vocab " + str(bowmin) + ".npy", bow_vocab)
    np.save(output_folder + file_name + "_filtered_vocab.npy", filtered_vocab)
    np.save(output_folder + file_name + "_corpus.npy", tokenized_corpus)
    np.save(output_folder + file_name + "_tokenized_corpus.npy", tokenized_ids)
    dt.write1dArray(processed_corpus, output_folder + file_name + "_corpus_processed.txt", encoding=encoding_type)
    np.save(output_folder + file_name + "_classes.npy", classes)
    if data_type != "reuters":
        np.save(output_folder + file_name + "_classes_categorical.npy", to_categorical(classes))

    print("------------------- Saved most, moving to PPMI etc", file_name)
    print(bow_fn)
    print(filtered_bow_fn)
    sp.save_npz(bow_fn, bow)
    sp.save_npz(filtered_bow_fn, filtered_bow)
    dt.write1dArray(word_list, "../data/" + data_type + "/bow/names/" + file_name + "_words " + str(no_below) + "-" + str(no_above) + "-all.txt", encoding=encoding_type)
    dt.write1dArray(list(bow_vocab.keys()), "../data/" + data_type + "/bow/names/" + file_name + "all_words_2_no_sw.txt", encoding=encoding_type)

    # PPMI on the filtered bag-of-words
    filtered_bow = filtered_bow.transpose()
    ppmi = sparse_ppmi.convertPPMISparse(filtered_bow)
    filtered_ppmi_sparse = sp.csr_matrix(ppmi).transpose()
    print(filtered_ppmi_fn)
    sp.save_npz(filtered_ppmi_fn, filtered_ppmi_sparse)

    if data_type == "reuters":
        testAll(["filtered_freq_bow", "filtered_ppmi_bow"],
                [filtered_ppmi_sparse.transpose().todense(), filtered_bow.todense()],
                [classes, classes], data_type)
    else:
        testAll(["filtered_freq_bow", "filtered_ppmi_bow"],
                [filtered_ppmi_sparse.transpose().todense(), filtered_bow.todense()],
                [to_categorical(classes), to_categorical(classes)], data_type)

    # Create PCA
    #classes = dt.import2dArray("../data/movies/classify/genres/class-all", "i")
    #bow = sp.csr_matrix(dt.import2dArray("../data/movies/bow/frequency/phrases/class-all-15-5-genres", "i")).transpose()
    ppmi = sparse_ppmi.convertPPMISparse(bow)
    ppmi_sparse = sp.csr_matrix(ppmi).transpose()
    print(ppmi_fn)
    sp.save_npz(ppmi_fn, ppmi_sparse)

    pca_size = [50, 100, 200]
    for p in pca_size:
        pca_fn = "../data/" + data_type + "/nnet/spaces/" + file_name + "_ppmi " + str(bowmin) + " S" + str(p) + "-all.npy"
        PCA_ppmi = getPCA(ppmi_sparse, p)
        np.save(pca_fn, PCA_ppmi)

    """
    if grams > 0:
        for i in range(2, grams+1):  # Up to 5-length grams
            filtered_ppmi_fn = "../data/" + data_type + "/bow/ppmi/" + file_name + "_ppmi " + str(grams) + "-gram" + str(no_below) + "-" + str(no_above) + "-all.npz"
            ppmi_fn = "../data/" + data_type + "/bow/ppmi/" + file_name + "_ppmi " + str(grams) + "-gram2" + "-all.npz"
            bow_fn = "../data/" + data_type + "/bow/frequency/phrases/" + file_name + "_bow " + str(grams) + "-gram2" + "-all.npz"
            filtered_bow_fn = "../data/" + data_type + "/bow/frequency/phrases/" + file_name + "_bow " + str(grams) + "-gram" + str(no_below) + "-" + str(no_above) + "-all.npz"
            processed_corpus, tokenized_corpus = ngrams(tokenized_corpus)
            vocab, dct = getVocab(tokenized_corpus)
            bow = doc2bow(tokenized_corpus, dct, 0)
            filtered_bow, word_list = filterBow(tokenized_corpus, dct, no_below-bowmin, no_above)
            tokenized_ids = tokensToIds(tokenized_corpus, vocab)
            np.save(output_folder + file_name + "_corpus " + str(i) + "-gram" + ".npy", tokenized_corpus)
            np.save(output_folder + file_name + "_tokenized_corpus " + str(i) + "-gram" + ".npy", tokenized_ids)
            np.save(output_folder + file_name + "_vocab " + str(i) + "-gram" + ".npy", vocab)
            dt.write1dArray(processed_corpus, output_folder + file_name + "_corpus_processed " + str(i) + "-gram" + ".txt")
            sp.save_npz(bow_fn, bow)
            sp.save_npz(filtered_bow_fn, filtered_bow)
            dt.write1dArray(word_list, "../data/" + data_type + "/bow/names/" + file_name + "_words " + str(i) + "-gram" + str(no_below) + "-" + str(no_above) + "-all.txt")
            filtered_bow = filtered_bow.transpose()
            ppmi = sparse_ppmi.convertPPMISparse(filtered_bow)
            filtered_ppmi_sparse = sp.csr_matrix(ppmi).transpose()
            sp.save_npz(filtered_ppmi_fn, filtered_ppmi_sparse)
            # Create PCA
            bow = bow.transpose()
            ppmi = sparse_ppmi.convertPPMISparse(bow)
            ppmi_sparse = sp.csr_matrix(ppmi).transpose()
            sp.save_npz(ppmi_fn, ppmi_sparse)
            pca_fn = "../data/" + data_type + "/nnet/spaces/" + file_name + "_ppmi " + str(grams) + "-gram" + str(no_below) + "-" + str(no_above) + "-all.npy"
            PCA_ppmi = getPCA(ppmi_sparse, 100)
            np.save(pca_fn, PCA_ppmi)
    """
    """
    file_name += "_stopwords"
    filtered_ppmi_fn = "../data/" + data_type + "/bow/ppmi/" + file_name + "_ppmi " + str(no_below) + "-" + str(no_above) + "-all.npz"
    filtered_bow_fn = "../data/" + data_type + "/bow/frequency/phrases/" + file_name + "_bow " + str(no_below) + "-" + str(no_above) + "-all.npz"
    pca_fn = "../data/" + data_type + "/nnet/spaces/" + file_name + "_ppmi " + str(no_below) + "-" + str(no_above) + "-all.npy"
    filtered_ppmi_sparse = sp.load_npz(filtered_ppmi_fn)
    PCA_ppmi = np.load(pca_fn)
    filtered_bow = sp.load_npz(filtered_bow_fn)
    """

    # Create averaged word vectors
    if data_type == "reuters":
        testAll(["ppmi_pca"], [PCA_ppmi], [classes], data_type)
    else:
        testAll(["ppmi_pca"], [PCA_ppmi], [to_categorical(classes)], data_type)
def __init__(self, training_data=10000, class_path=None, network_type="ft", randomize_finetune_weights=False,
             dropout_noise=None, amount_of_hidden=0, epochs=1, learn_rate=0.01, loss="mse", batch_size=1,
             past_model_bias_fn=None, identity_swap=False, reg=0.0, amount_of_finetune=1, output_size=25,
             hidden_activation="tanh", layer_init="glorot_uniform", output_activation="tanh", deep_size=None,
             corrupt_finetune_weights=False, hidden_layer_size=100, file_name="unspecified_filename",
             vector_path=None, is_identity=False, activity_reg=0.0, finetune_size=0, data_type="movies",
             optimizer_name="rmsprop", noise=0.0, fine_tune_weights_fn=None, past_model_weights_fn=None,
             from_ae=True, class_outputs=False, finetune_activation="linear"):
    self.model = Sequential()
    self.training_data = training_data
    self.class_path = class_path
    self.learn_rate = learn_rate
    self.epochs = epochs
    self.loss = loss
    self.batch_size = batch_size
    self.hidden_activation = hidden_activation
    self.layer_init = layer_init
    self.output_activation = output_activation
    self.hidden_layer_size = hidden_layer_size
    self.file_name = file_name
    self.vector_path = vector_path
    self.dropout_noise = dropout_noise
    self.finetune_size = finetune_size
    self.class_outputs = class_outputs
    self.reg = reg
    self.activity_reg = activity_reg  # Was assigned twice in the original
    self.amount_of_finetune = amount_of_finetune
    self.amount_of_hidden = amount_of_hidden
    self.output_size = output_size
    self.finetune_activation = finetune_activation
    print(data_type)

    if optimizer_name == "adagrad":
        self.optimizer = Adagrad()
    else:
        self.optimizer = SGD(lr=learn_rate, momentum=0.0, decay=0.0, nesterov=False)

    entity_vectors, entity_classes = None, None
    if network_type == "ft":
        entity_vectors, entity_classes = self.fineTuneNetwork(
            past_model_weights_fn, past_model_bias_fn, fine_tune_weights_fn, is_identity, identity_swap,
            randomize_finetune_weights, corrupt_finetune_weights, deep_size, from_ae)
    elif network_type == "da":
        entity_vectors, entity_classes = self.denoisingAutoencoder(noise, deep_size)

    x_train, x_test, y_train, y_test = train_test_split(entity_vectors, entity_classes, test_size=0.3,
                                                        random_state=0)
    #x_train, y_train = dt.balance2dClasses(x_train, y_train, 1)

    # Compile the model and fit it to the data
    self.model.fit(x_train, y_train, nb_epoch=self.epochs, batch_size=self.batch_size, verbose=1)

    if network_type == "ft":
        if class_outputs:
            scores = []
            y_pred = self.model.predict(x_test)
            y_pred[y_pred >= 0.5] = 1
            y_pred[y_pred < 0.5] = 0
            f1 = f1_score(y_test, y_pred, average="macro")
            accuracy_array = []
            for y in range(len(y_pred)):
                accuracy_array.append(accuracy_score(y_test[y], y_pred[y]))
            accuracy = np.mean(accuracy_array)
            scores.append(f1)
            scores.append(accuracy)
            dt.write1dArray(scores, "../data/" + data_type + "/nnet/scores/" + self.file_name + ".txt")
            print(scores)
        self.output_clusters = self.model.predict(entity_vectors)
        dt.write2dArray(self.output_clusters.transpose(),
                        "../data/" + data_type + "/nnet/clusters/" + self.file_name + ".txt")

    total_file_name = "../data/" + data_type + "/nnet/spaces/" + self.file_name
    # Dump the hidden representation after each layer by rebuilding a
    # truncated copy of the model up to that layer
    for l in range(0, len(self.model.layers) - 1):
        if dropout_noise is not None and dropout_noise > 0.0:  # Was "or", which crashes when dropout_noise is None
            if l % 2 == 1:
                continue  # skip the dropout layers themselves
        print("Writing", l, "layer")
        truncated_model = Sequential()
        for a in range(l + 1):
            truncated_model.add(self.model.layers[a])
        truncated_model.compile(loss=self.loss, optimizer="sgd")
        self.end_space = truncated_model.predict(entity_vectors)
        dt.write2dArray(self.end_space, total_file_name + "L" + str(l) + ".txt")

    for l in range(len(self.model.layers)):
        try:
            dt.write2dArray(self.model.layers[l].get_weights()[0],
                            "../data/" + data_type + "/nnet/weights/L" + str(l) + file_name + ".txt")
            dt.write1dArray(self.model.layers[l].get_weights()[1],
                            "../data/" + data_type + "/nnet/bias/L" + str(l) + file_name + ".txt")
        except IndexError:
            print("Layer ", str(l), "Failed")
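# --- Illustrative sketch (not part of the original file) ---
# The loop above rebuilds a truncated Sequential model to dump each hidden
# representation. A minimal standalone version of the same trick, on a
# hypothetical three-layer network (Keras 1-era API, to match the nb_epoch
# usage in the code above):
from keras.models import Sequential
from keras.layers import Dense
import numpy as np

model = Sequential()
model.add(Dense(64, input_dim=100, activation="tanh"))
model.add(Dense(32, activation="tanh"))
model.add(Dense(10, activation="sigmoid"))
model.compile(loss="mse", optimizer="sgd")

x = np.random.rand(5, 100)
for l in range(len(model.layers) - 1):
    truncated = Sequential()
    for a in range(l + 1):
        truncated.add(model.layers[a])   # reuse the (trained) layers directly
    truncated.compile(loss="mse", optimizer="sgd")
    hidden = truncated.predict(x)        # activations after layer l
    print(l, hidden.shape)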
def getVectors(input_folder, file_names_fn, extension, output_folder, only_words_in_x_entities,
               words_without_x_entities, cut_first_line=False, get_all=False, additional_name="",
               make_individual=True, classification="", use_all_files="", minimum_words=0, data_type="",
               sparse_matrix=False, word_count_amt=0):
    if not use_all_files:  # Was "is None", which never matched the "" default
        file_names = dt.import1dArray(file_names_fn)
    else:
        file_names = dt.getFns(use_all_files)

    phrase_dict = defaultdict(int)
    failed_indexes = []
    failed_filenames = []
    working_filenames = []

    # First, get all possible phrase names and build a dictionary of them from the files
    for f in range(len(file_names)):
        try:
            full_name = input_folder + file_names[f] + "." + extension
            phrase_list = dt.import2dArray(full_name, "s")
            if cut_first_line:
                phrase_list = phrase_list[1:]
            word_count = 0
            for p in phrase_list:
                word_count += int(p[1])
            if word_count > word_count_amt:
                for p in phrase_list:
                    if p[0] != "all":
                        phrase_dict[p[0]] += 1
                    else:
                        print("found class all")
                working_filenames.append(file_names[f])
            else:
                print("Failed, word count below threshold", file_names[f], f, word_count)
                failed_filenames.append(file_names[f])
                failed_indexes.append(f)
        except FileNotFoundError:
            print("Failed to find", file_names[f], f)
            failed_filenames.append(file_names[f])
            failed_indexes.append(f)

    print(failed_indexes)
    print(failed_filenames)

    phrase_sets = []
    # Convert to array so we can sort it
    phrase_list = []
    entity_names = dt.import1dArray(file_names_fn)
    matching_filenames = []
    failed_fns = []

    if data_type == "wines":
        for e in entity_names:
            found = False
            for f in working_filenames:
                if "zz" in f:
                    new_f = f[2:]
                else:
                    new_f = f
                if dt.removeEverythingFromString(e) == dt.removeEverythingFromString(new_f):
                    matching_filenames.append(f)
                    found = True
                    break
            if not found:
                failed_fns.append(e)
        working_filenames = np.unique(np.asarray(matching_filenames))

    test_dupes = np.unique(np.asarray(working_filenames))
    print(len(test_dupes))

    for key, value in phrase_dict.items():
        if value >= only_words_in_x_entities:
            phrase_list.append(key)
    all_phrases = []
    for key, value in phrase_dict.items():
        all_phrases.append(key)
    phrase_sets.append(phrase_list)

    counter = 0
    for phrase_list in phrase_sets:
        if not get_all and counter > 0:
            break
        all_phrase_fn = output_folder + "frequency/phrases/" + "class-all-" + str(only_words_in_x_entities) + "-" + str(words_without_x_entities) + "-" + classification
        phrase_name_fn = output_folder + "names/" + str(only_words_in_x_entities) + "-" + str(words_without_x_entities) + "-" + classification + ".txt"
        phrase_list = sorted(phrase_list)
        print("Found", len(phrase_list), "Phrases")
        print(phrase_list[:20])
        print("Failed", len(failed_filenames), "Files")
        print(failed_filenames[:20])

        # Create a dictionary to obtain the index of a phrase that's being checked
        phrase_index_dict = defaultdict()
        for p in range(len(phrase_list)):
            phrase_index_dict[phrase_list[p]] = p

        # Create an empty 2d array to store a matrix of movies and phrases
        all_phrases_complete = []
        for f in working_filenames:
            all_phrases_complete.append([0] * len(phrase_list))
        all_phrases_complete = np.asarray(all_phrases_complete)
        print("Each entity is length", len(all_phrases_complete[0]))
        print("The overall matrix is", len(all_phrases_complete))
        if sparse_matrix:
            all_phrases_complete = sp.csr_matrix(all_phrases_complete)

        # Then, populate the overall bag-of-words for each film (with all other phrases already set to 0)
        completed_index = []
        if data_type == "wines":
            print("wines")
            """
            merge_indexes = []
            for f in range(len(working_filenames)):
                print(working_filenames[f])
                for i in range(len(working_filenames)):
                    if i == f:
                        continue
                    for ci in completed_index:
                        if i == ci:
                            continue
                    if "~" in working_filenames[i]:
                        if working_filenames[f] == working_filenames[i][:-1] or working_filenames[f] == working_filenames[i][2:-1]:
                            completed_index.append(i)
                            merge_indexes.append((f, i))
            """
        for f in range(len(working_filenames)):
            n_phrase_list = dt.import2dArray(input_folder + working_filenames[f] + "." + extension, "s")
            if cut_first_line:
                n_phrase_list = n_phrase_list[1:]
            for p in n_phrase_list:
                phrase = p[0]
                try:
                    phrase_index = phrase_index_dict[phrase]
                    if not sparse_matrix:
                        all_phrases_complete[f][phrase_index] = int(p[1])
                    else:
                        all_phrases_complete[f, phrase_index] = int(p[1])
                    #print("Kept", phrase)
                except KeyError:
                    continue
                    #print("Deleted phrase", phrase)
        """
        cols_to_delete = []
        if data_type == "wines":
            for mt in merge_indexes:
                for v in range(len(all_phrases_complete)):
                    all_phrases_complete[v][mt[0]] += all_phrases_complete[v][mt[1]]
                cols_to_delete.append(mt[1])
            all_phrases_complete = np.delete(all_phrases_complete, cols_to_delete, 1)
            working_filenames = np.delete(working_filenames, cols_to_delete)
        """
        # Import entities specific to the classification and trim the phrases
        # of entities that aren't included in it
        if classification != "all" and classification != "mixed" and classification != "genres" and classification != "ratings" and classification != "types":
            classification_entities = dt.import1dArray("../data/" + data_type + "/classify/" + classification + "/available_entities.txt")
            all_phrases_complete = dt.match_entities(all_phrases_complete, classification_entities, file_names)
        elif classification == "all":
            print("All~~~~~~~~~~~~~~")
            dt.write1dArray(working_filenames, "../data/" + data_type + "/classify/" + classification + "/available_entities.txt")

        if not sparse_matrix:
            all_phrases_complete = np.asarray(all_phrases_complete).transpose()
        else:
            all_phrases_complete = all_phrases_complete.transpose()

        indexes_to_delete = []
        if sparse_matrix:
            cx = sp.coo_matrix(all_phrases_complete)
            indexes_to_delete = []
            for i, j, v in zip(cx.row, cx.col, cx.data):
                print("(%d, %d), %s" % (i, j, v))  # Was a Python 2 print statement

        # Delete phrases that appear in nearly every entity
        for a in range(len(all_phrases_complete)):
            if np.count_nonzero(all_phrases_complete[a]) > len(all_phrases_complete[a]) - words_without_x_entities:
                print("Recorded an entity " + str(phrase_list[a]) + " with too little difference")
                indexes_to_delete.append(a)
        indexes_to_delete.sort()
        indexes_to_delete.reverse()
        for i in indexes_to_delete:
            all_phrases_complete = np.delete(all_phrases_complete, i, 0)
            print("Deleted an entity " + str(phrase_list[i]) + " with too little difference")
            phrase_list = np.delete(phrase_list, i, 0)

        dt.write1dArray(phrase_list, phrase_name_fn)
        if make_individual:
            for p in range(len(all_phrases_complete)):
                dt.write1dArray(all_phrases_complete[p],
                                output_folder + "frequency/phrases/class-" + phrase_list[p] + "-" + str(only_words_in_x_entities) + "-" + str(words_without_x_entities) + "-" + classification)
        dt.write2dArray(all_phrases_complete, all_phrase_fn)
        print("Created class-all")

        # Binarize the counts and write the binary variants
        all_phrases_complete = np.asarray(all_phrases_complete).transpose()
        for a in range(len(all_phrases_complete)):
            for v in range(len(all_phrases_complete[a])):
                if all_phrases_complete[a][v] > 1:
                    all_phrases_complete[a][v] = 1
        all_phrases_complete = np.asarray(all_phrases_complete).transpose()
        if make_individual:
            for p in range(len(all_phrases_complete)):
                dt.write1dArray(all_phrases_complete[p],
                                output_folder + "binary/phrases/class-" + phrase_list[p] + "-" + str(only_words_in_x_entities) + "-" + str(words_without_x_entities) + "-" + classification)
        all_phrase_fn = output_folder + "binary/phrases/" + "class-all-" + str(only_words_in_x_entities) + "-" + str(words_without_x_entities) + "-" + classification
        dt.write2dArray(all_phrases_complete, all_phrase_fn)
        print("Created class-all binary")
        counter += 1
def regularNewsgroupsStuff():  # Rename later
    classification = "all"
    highest_amt = 18836
    lowest_amt = 30
    all_fn = "../data/newsgroups/bow/frequency/phrases/class-all-" + str(lowest_amt) + "-" + str(highest_amt) + "-" + classification

    #newsgroups_train = fetch_20newsgroups(subset='train', shuffle=False, remove=("headers", "footers", "quotes"))
    #newsgroups_test = fetch_20newsgroups(subset='test', shuffle=False, remove=("headers", "footers", "quotes"))
    newsgroups_all = fetch_20newsgroups(subset='all', shuffle=False, remove=("headers", "footers", "quotes"))  # Was named "all", shadowing the builtin
    train_len = len(newsgroups_all.data)
    print(newsgroups_all.target[train_len - 1])
    print(newsgroups_all.target[train_len - 2])
    print(newsgroups_all.target[train_len - 3])
    print(newsgroups_all.target[0])
    print(newsgroups_all.target[1])
    print(newsgroups_all.target[2])
    vectors = newsgroups_all.data
    classes = newsgroups_all.target

    # The standard 20 Newsgroups split: 11314 training documents
    ac_x_train = vectors[:11314]
    ac_x_test = vectors[11314:]
    ac_y_train = classes[:11314]
    ac_y_test = classes[11314:]
    print(classes[train_len - 1])
    print(classes[train_len - 2])
    print(classes[train_len - 3])

    tf_vectorizer = CountVectorizer(max_df=highest_amt, min_df=lowest_amt, stop_words='english')
    print("completed vectorizer")
    tf = tf_vectorizer.fit(vectors)
    feature_names = tf.get_feature_names()
    dt.write1dArray(feature_names, "../data/newsgroups/bow/names/" + str(lowest_amt) + "-" + str(highest_amt) + "-" + classification + ".txt")
    vocab_dict = tf.vocabulary_  # Was named "dict", shadowing the builtin
    tf = tf_vectorizer.transform(vectors)
    dense = FunctionTransformer(lambda x: x.todense(), accept_sparse=True)
    tf = dense.fit_transform(tf)
    tf = np.squeeze(np.asarray(tf))
    tf = np.asarray(tf, dtype=np.int32)
    tf = tf.transpose()

    # Per-term frequency counts
    freqs = []
    for t in tf:
        freq = 0
        for i in range(len(t)):
            if t[i] != 0:
                freq += t[i]
        freqs.append(freq)
    print("Amount of terms:", len(tf))
    dt.write1dArray(freqs, "../data/newsgroups/bow/freq_count/" + str(lowest_amt) + "-" + str(highest_amt))
    #dt.write2dArray(tf, all_fn)
    #mt.printIndividualFromAll("newsgroups", "frequency/phrases", lowest_amt, highest_amt, classification, all_fn=all_fn, names_array=feature_names)

    ppmi_fn = "../data/newsgroups/bow/ppmi/class-all-" + str(lowest_amt) + "-" + str(highest_amt) + "-" + classification
    #if dt.fileExists(ppmi_fn) is False:
    tf = sp.csr_matrix(tf)
    sp.save_npz(all_fn, tf)
    ppmi = mt.convertPPMI(tf)
    #dt.write2dArray(ppmi, ppmi_fn)
    ppmi_sparse = sp.csr_matrix(ppmi)
    sp.save_npz(ppmi_fn, ppmi_sparse)
    mt.printIndividualFromAll("newsgroups", "ppmi", lowest_amt, highest_amt, classification, all_fn=all_fn, names_array=feature_names)

    print("1")
    classes = np.asarray(classes, dtype=np.int32)
    print(2)
    # One-hot encode the class labels
    classes_dense = np.zeros(shape=(len(classes), np.amax(classes) + 1), dtype=np.int8)
    print(3)
    for c in range(len(classes)):
        classes_dense[c][classes[c]] = 1
    print(4)
    names = list(newsgroups_all.target_names)
    dt.write1dArray(names, "../data/newsgroups/classify/newsgroups/names.txt")
    classes_dense = classes_dense.transpose()
    for c in range(len(classes_dense)):
        dt.write1dArray(classes_dense[c], "../data/newsgroups/classify/newsgroups/class-" + names[c])
    classes_dense = classes_dense.transpose()
    dt.write2dArray(classes_dense, "../data/newsgroups/classify/newsgroups/class-all")

    feature_names = dt.import1dArray("../data/newsgroups/bow/names/" + str(lowest_amt) + "-" + str(highest_amt) + "-all.txt")
    freq = dt.import2dArray(all_fn)
    # Binarize the frequency matrix
    binary = np.zeros(shape=(len(freq), len(freq[0])))
    for i in range(len(freq)):
        for j in range(len(freq[i])):
            if freq[i][j] > 0:
                binary[i][j] = 1
    binary_all_fn = "../data/newsgroups/bow/binary/phrases/class-all-" + str(lowest_amt) + "-" + str(highest_amt) + "-" + classification
    binary = sp.csr_matrix(binary)
    sp.save_npz(binary_all_fn, binary)
    #dt.write2dArray(binary, binary_all_fn)
    #mt.printIndividualFromAll("newsgroups", "binary/phrases", lowest_amt, highest_amt, classification, all_fn=all_fn, names_array=feature_names)
    #ppmi_fn = "../data/newsgroups/bow/ppmi/class-all-"+str(lowest_amt)+"-"+str(highest_amt)+"-" + classification

#regularNewsgroupsStuff()
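# --- Illustrative sketch (not part of the original file) ---
# The dense class matrix above is a manual one-hot encoding; an equivalent
# vectorized version with numpy, on hypothetical integer labels 0..K-1:
import numpy as np

labels = np.array([0, 2, 1, 2])
one_hot = np.zeros((len(labels), labels.max() + 1), dtype=np.int8)
one_hot[np.arange(len(labels)), labels] = 1
print(one_hot)
# [[1 0 0]
#  [0 0 1]
#  [0 1 0]
#  [0 0 1]]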
def importCertificates(cert_fn, entity_name_fn):
    all_lines = dt.import1dArray(cert_fn)[14:]
    en = dt.import1dArray(entity_name_fn)
    original_en = dt.import1dArray(entity_name_fn)

    # Split the entity names into (normalized name, year) pairs for matching
    en_name = []
    en_year = []
    for e in range(len(en)):
        split = en[e].split()
        en_year.append(split[len(split) - 1])
        name = "".join(split[:len(split) - 1])
        en_name.append(dt.removeEverythingFromString(name))

    # Initialize ratings dict
    """
    ratings = {
        "USA:G": [],
        "USA:PG": [],
        "USA:PG-13": [],
        "USA:R": []
    }
    """
    ratings = {
        "UK:PG": [],
        "UK:12": [],
        "UK:12A": [],
        "UK:15": [],
        "UK:18": [],
    }
    all_ratings = defaultdict(list)
    recently_found_name = ""
    recently_found_year = ""
    recently_found_found = False
    counter = 0
    temp_fn = "../data/temp/uk_cert_dict.pickle"
    if dt.fileExists(temp_fn) is False:
        for line in all_lines:
            line = line.split("\t")
            split_ny = line[0].split("{")[0]
            split_ny = split_ny.split()
            for i in range(len(split_ny) - 1, -1, -1):
                if "{" in split_ny[i]:
                    del split_ny[i]
            entity_year_bracketed = split_ny[len(split_ny) - 1]
            if "(V)" in entity_year_bracketed or "(TV)" in entity_year_bracketed or "(VG)" in entity_year_bracketed:
                entity_year_bracketed = split_ny[len(split_ny) - 2]
            try:
                entity_year = dt.keepNumbers(entity_year_bracketed)[0]
                entity_name = dt.removeEverythingFromString("".join(split_ny[:len(split_ny) - 1]))
                found = False
                skip = False
                # Consecutive lines often repeat the same title; reuse the last match
                if recently_found_name == entity_name and recently_found_year == entity_year:
                    skip = True
                    found = recently_found_found
                if not skip:
                    if not found:
                        for n in range(len(en_name)):
                            if entity_name == en_name[n] and entity_year == en_year[n]:
                                print("found", entity_name, entity_year)
                                found = True
                                break
                if found:
                    if "(" not in line[len(line) - 1]:
                        entity_rating = line[len(line) - 1]
                    else:
                        entity_rating = line[len(line) - 2]
                    all_ratings[entity_rating].append(entity_name)
                    if entity_rating in ratings:
                        ratings[entity_rating].append(entity_name)
                        print("rating correct", entity_name, entity_year, entity_rating)
            except IndexError:
                print("IndexError")
                print(line)
                print(split_ny)
                print(entity_year_bracketed)
            recently_found_name = entity_name
            recently_found_year = entity_year
            recently_found_found = found
            counter += 1
            if counter % 1000 == 0:
                print(counter)
        # Store data (serialize)
        with open(temp_fn, 'wb') as handle:
            pickle.dump(ratings, handle, protocol=pickle.HIGHEST_PROTOCOL)
        # Store data (serialize)
        with open("../data/temp/uk_cert_dict_all.pickle", 'wb') as handle:
            pickle.dump(all_ratings, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Load data (deserialize)
    with open(temp_fn, 'rb') as handle:
        ratings = pickle.load(handle)
    if dt.fileExists("../data/temp/uk_cert_dict_all.pickle"):
        with open("../data/temp/uk_cert_dict_all.pickle", 'rb') as handle:
            all_ratings = pickle.load(handle)

    top_size = 0
    for key, value in all_ratings.items():
        top_size += len(value)
    print(top_size)
    top_size = 0
    new_ratings = defaultdict(list)
    real_name_dict_fn = "../data/temp/uk_real_name_dict.dict"
    if dt.fileExists(real_name_dict_fn) is False:
        # Match the names back to the original names
        for key, value in all_ratings.items():
            for r in ratings:
                if r == key:
                    top_size += len(value)
                    for v in range(len(value)):
                        found = False
                        for n in range(len(en_name)):
                            if value[v] == en_name[n]:
                                found = True
                                value[v] = original_en[n]
                                break
                        if found:
                            new_ratings[key].append(value[v])
                    break
        with open(real_name_dict_fn, 'wb') as handle:
            pickle.dump(new_ratings, handle, protocol=pickle.HIGHEST_PROTOCOL)
    else:
        with open(real_name_dict_fn, 'rb') as handle:
            new_ratings = pickle.load(handle)

    # Get the final dict setup
    """
    final_dict = {
        "USA-G": [],
        "USA-PG-PG13": [],
        "USA-R": [],
    }
    """
    final_dict = {
        "UK-PG": [],
        "UK-12-12A": [],
        "UK-15": [],
        "UK-18": []
    }
    # Append the final dict ratings
    final_dict["UK-PG"].extend(all_ratings["UK:PG"])
    final_dict["UK-12-12A"].extend(all_ratings["UK:12"])
    final_dict["UK-12-12A"].extend(all_ratings["UK:12A"])
    final_dict["UK-15"].extend(all_ratings["UK:15"])
    final_dict["UK-18"].extend(all_ratings["UK:18"])
    """
    final_dict["USA-G"].extend(all_ratings["USA:G"])
    final_dict["USA-PG-PG13"].extend(all_ratings["USA:PG"])
    final_dict["USA-PG-PG13"].extend(all_ratings["USA:PG13"])
    final_dict["USA-R"].extend(all_ratings["USA:R"])
    """
    """
    final_name_dict = {
        "USA-G": [],
        "USA-PG-PG13": [],
        "USA-R": [],
    }
    """
    final_name_dict = {
        "UK-PG": [],
        "UK-12-12A": [],
        "UK-15": [],
        "UK-18": [],
    }
    # Append the final dict good names
    final_name_dict["UK-PG"].extend(new_ratings["UK:PG"])
    final_name_dict["UK-12-12A"].extend(new_ratings["UK:12"])
    final_name_dict["UK-12-12A"].extend(new_ratings["UK:12A"])
    final_name_dict["UK-15"].extend(new_ratings["UK:15"])
    final_name_dict["UK-18"].extend(new_ratings["UK:18"])
    """
    final_name_dict["USA-G"].extend(new_ratings["USA:G"])
    final_name_dict["USA-PG-PG13"].extend(new_ratings["USA:PG"])
    final_name_dict["USA-PG-PG13"].extend(new_ratings["USA:PG13"])
    final_name_dict["USA-R"].extend(new_ratings["USA:R"])
    """

    # Create a unique list of the entities found
    entities_found = []
    for key, items in new_ratings.items():
        for i in items:
            entities_found.append(i)
    entities_found = np.unique(entities_found)
    print(len(entities_found))

    # Get the en_names back...
    jacked_up_entities_found = []
    for n in entities_found:
        new_n = n.split()[:-1]
        jacked_up_entities_found.append(dt.removeEverythingFromString(" ".join(new_n)))

    classes = [[0] * len(entities_found), [0] * len(entities_found),
               [0] * len(entities_found), [0] * len(entities_found)]
    counter = 0
    class_names = []
    for key, items in final_dict.items():
        for i in items:
            for e in range(len(jacked_up_entities_found)):
                if i == jacked_up_entities_found[e]:
                    classes[counter][e] = 1
        class_names.append(key)
        counter += 1

    # Drop entities that matched no class
    classes = np.asarray(classes).transpose()
    indexes_to_delete = []
    for c in range(len(classes)):
        found = False
        for i in classes[c]:
            if i == 1:
                found = True
                break
        if not found:
            indexes_to_delete.append(c)
    classes = np.delete(classes, indexes_to_delete, axis=0)
    entities_found = np.delete(entities_found, indexes_to_delete)

    classes = classes.transpose()
    for c in range(len(classes)):
        dt.write1dArray(classes[c], "../data/movies/classify/uk-ratings/class-" + class_names[c])
    classes = classes.transpose()
    dt.write2dArray(classes, "../data/movies/classify/uk-ratings/class-all")
    dt.write1dArray(entities_found, "../data/movies/classify/uk-ratings/available_entities.txt")
    dt.write1dArray(class_names, "../data/movies/classify/uk-ratings/names.txt")
    print("k")