def train_clustering_parameters(vector_inpath):
    """
    Function that tries to figure out the optimal clustering parameters with regard to DBSCAN's epsilon,
    min_samples and p.

    Args:
        vector_inpath (str): Path to vector file. The file has to have the following format (separated by spaces):
            <index of original vector #1> <index of original vector #2> <Dimension 1> ... <Dimension n>
    """
    x0 = numpy.array([2.5, 20, 2])  # Initial parameters (eps, min_samples, p)
    print alt("Load mappings...")
    indices, model = load_mappings_from_model(vector_inpath)
    X = numpy.array([model[key] for key in indices])  # Arrange data for optimization
    print alt("Start training...")

    # Cluster data and calculate the objective as silhouette coefficient per cluster
    def simple_clustering(x):
        # Start clustering
        print alt("Current parameters: %s" % (str(x)))
        dbscan = DBSCAN(eps=x[0], min_samples=x[1], p=x[2])
        dbscan.fit(X)

        # Evaluate results
        cluster_sizes = get_cluster_size(dbscan.labels_)
        print alt("Current cluster sizes: %s" % (cluster_sizes))
        sscore = silhouette_score(X, dbscan.labels_)
        # Normalize the silhouette coefficient by the number of clusters (minus one for the noise cluster)
        tscore = sscore / (len(cluster_sizes.keys()) - 1)
        print alt("Current value of objective function: %.5f" % (tscore))
        print "-" * 50
        return tscore

    # Start minimizing
    result = minimize(simple_clustering, x0, method="Nelder-Mead")
    print result.x  # Print resulting parameter configuration
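# Example usage (illustrative sketch; "mappings.vec" is a hypothetical path to a mapping vector file in the
# format described above). The call prints the parameter vector [eps, min_samples, p] found by Nelder-Mead:
#
#   train_clustering_parameters("mappings.vec")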
def cluster_mappings(vector_inpath, do_pca=False, target_dim=100, indices_inpath=None, epsilon=2.625, min_s=20):
    """
    Cluster mapping vectors created with :py:mod:`src.mapping.mapthreading` or :py:mod:`rc.mapping.map_vectors.py`.
    Because just reading the number of clusters and their sizes is not very informative, there's an option to
    resolve the indices of the vectors in each cluster to their original word pairs.

    Args:
        vector_inpath (str): Path to vector file. The file should have the following format (separated by spaces):
            <index of original vector #1> <index of original vector #2> <Dimension 1> ... <Dimension n>
        do_pca (bool): Flag to indicate whether PCA should be executed before clustering to reduce the amount of
            computation.
        target_dim (int): Number of dimensions vectors should be shrunk to in case PCA is performed.
        indices_inpath (str): Path to file with the indices given to words. The file should have the following
            format: <index of word> <word> (separated by tab)
        epsilon (float): Radius of the circle DBSCAN uses to look for other data points.
        min_s (int): Minimum number of points within radius epsilon DBSCAN needs to declare a point a core object.
    """
    print alt("Load mappings...")
    indices, model = load_mappings_from_model(vector_inpath)
    X = numpy.array([model[key] for key in indices])
    if not indices_inpath:
        del model  # Free up memory; the model is only needed later to resolve indices

    # Do PCA if wanted
    if do_pca:
        print alt("Truncate vectors with PCA to %i dimensions..." % target_dim)
        pca = PCA(n_components=target_dim)
        pca.fit(X)
        X = pca.transform(X)

    # Start clustering
    print alt("Cluster points...")
    dbscan = DBSCAN(eps=epsilon, min_samples=min_s, p=2)
    dbscan.fit(X)

    # Get results
    labels = dbscan.labels_
    print alt("Cluster sizes:")
    print get_cluster_size(labels)
    print alt("Finished clustering!")
    sscore = silhouette_score(X, labels)
    print("Silhouette Coefficient: %0.3f" % (sscore))

    # Resolve indices and print all word pairs in clusters if wanted
    if indices_inpath:
        resolve_indices(indices, labels, indices_inpath, model)
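# Example usage (illustrative sketch; the vector and index file paths are hypothetical):
#
#   cluster_mappings("mappings.vec", do_pca=True, target_dim=100,
#                    indices_inpath="indices.txt", epsilon=2.625, min_s=20)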
def convert_decow_to_plain(decow_dir, out_dir, log_path, merge_nes, log_interval):
    """
    Convert the whole corpus into plain text.

    Args:
        decow_dir (str): Path to directory with decow corpus parts.
        out_dir (str): Path where plain text parts should be written to.
        log_path (str): Path where the log files should be written to.
        merge_nes (bool): Flag to indicate whether multi-word expressions should be merged with underscores.
        log_interval (int): Interval to log the current process state in seconds.
    """
    # Split logging interval into hours - minutes - seconds
    m_proc, s_proc = divmod(log_interval, 60)
    h_proc, m_proc = divmod(m_proc, 60)

    # Init logfile
    with codecs.open(log_path, "a", "utf-8") as log_file:
        log_file.write(alt("Starting logging...\n"))
        log_file.write(alt("Corpus (parts) directory:\t%s\n" % decow_dir))
        log_file.write(alt("Output directory:\t\t%s\n" % out_dir))
        log_file.write(alt("Logging path:\t\t%s\n" % log_path))
        log_file.write(alt("Logging intervals:\n\t Every %2dh %2dm %2ds for metalog\n" % (h_proc, m_proc, s_proc)))

    # Start processes
    @log_time(log_path, log_interval)
    def _convert_decow_to_plain(decow_dir, out_dir, log_path, merge_nes, log_interval):
        with codecs.open(log_path, "a", "utf-8") as log_file:
            inpaths = [
                path for path in os.listdir(decow_dir)
                if ".DS_Store" not in path and "decow" in path
            ]
            # One worker process per corpus part
            log_file.write(alt("Preparing %i process(es)...\n" % (len(inpaths))))
            pool = multiprocessing.Pool(processes=len(inpaths))
            log_file.write(alt("Starting process(es)!\n"))
            if merge_nes:
                pool.map(convert_part_merging,
                         [(decow_dir + inpath, out_dir, log_path, log_interval) for inpath in inpaths])
            else:
                pool.map(convert_part,
                         [(decow_dir + inpath, out_dir, log_path, log_interval) for inpath in inpaths])

    _convert_decow_to_plain(decow_dir, out_dir, log_path, merge_nes, log_interval)
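# Example usage (illustrative sketch; the directory and log paths are hypothetical). This would convert all
# gzipped decow parts found in /data/decow/, merging multi-word named entities and logging every 10 minutes:
#
#   convert_decow_to_plain("/data/decow/", "/data/decow_plain/", "conversion.log",
#                          merge_nes=True, log_interval=600)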
def _convert_part_merging(inpath, dir_outpath, log_path):
    """
    Convert a single gzipped decow corpus part into plain text, merging the tokens of multi-word named entities
    with underscores.
    """
    with codecs.open(log_path, "a", "utf-8") as log_file:
        process_name = multiprocessing.current_process().name
        log_file.write(alt("%s: Start logging processing of\n\t%s to \n\t%s...\n" % (process_name, inpath, dir_outpath)))
        file_n = get_file_number(inpath)
        outpath = dir_outpath + 'decow%s_out.txt' % (str(file_n))

        with gz.open(inpath, 'rb') as infile, codecs.open(outpath, 'wb', 'utf-8') as outfile:
            sentence = []
            line, lcount = infile.readline().strip().decode("utf-8"), 1
            while line != "":
                if lcount % 100000 == 0:
                    log_file.write(alt("%s: Processing line nr. %i...\n" % (process_name, lcount)))

                ne = extract_named_entity(line)  # Extract possible named entity

                if line.startswith(u'<s'):
                    # Sentence boundary: flush the current sentence
                    outfile.write('%s\n' % (' '.join(sentence)))
                    sentence = []
                # If a named entity was found, try to complete it in case it's a multi-word expression
                elif ne is not None:
                    while True:
                        next_line = infile.readline().strip().decode("utf-8")
                        lcount += 1
                        if not contains_tag(next_line):
                            next_ne = extract_named_entity(next_line)
                            if next_ne is not None and next_ne[1] == ne[1]:
                                # Same entity type on the following token: merge with an underscore
                                ne = ("%s_%s" % (ne[0], next_ne[0]), ne[1])
                            else:
                                break
                        else:
                            break
                    sentence.append(ne[0])
                    line = next_line
                    continue
                elif not line.startswith(u'<'):
                    # Regular token line: the token is the first tab-separated column
                    sentence.append(line.split('\t')[0])

                line, lcount = infile.readline().strip().decode("utf-8"), lcount + 1
def load_indices(indices_inpath):
    """
    Load word indices from a file. The file should have the following format:
    <index of word> <word> (separated by tab)

    Args:
        indices_inpath (str): Path to index file.
    """
    print alt("Load indices...")
    indices = defaultdict(str)
    with codecs.open(indices_inpath, "rb", "utf-8") as indices_infile:
        line = indices_infile.readline().strip()
        while line:
            parts = line.split("\t")
            index = int(parts[0])
            word = parts[1].replace(" ", "_")
            indices[index] = word
            line = indices_infile.readline().strip()
    return indices
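# Example usage (illustrative sketch; "indices.txt" is a hypothetical tab-separated index file containing,
# for instance, the line "42\tNew York"):
#
#   indices = load_indices("indices.txt")
#   print indices[42]  # -> New_York (multi-word entries are joined with underscores)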
def _convert_decow_to_plain(decow_dir, out_dir, log_path, merge_nes, log_interval):
    with codecs.open(log_path, "a", "utf-8") as log_file:
        inpaths = [
            path for path in os.listdir(decow_dir)
            if ".DS_Store" not in path and "decow" in path
        ]
        # One worker process per corpus part
        log_file.write(alt("Preparing %i process(es)...\n" % (len(inpaths))))
        pool = multiprocessing.Pool(processes=len(inpaths))
        log_file.write(alt("Starting process(es)!\n"))
        if merge_nes:
            pool.map(convert_part_merging,
                     [(decow_dir + inpath, out_dir, log_path, log_interval) for inpath in inpaths])
        else:
            pool.map(convert_part,
                     [(decow_dir + inpath, out_dir, log_path, log_interval) for inpath in inpaths])
def aggregate_cluster(points, labels):
    """
    Arranges all data points by cluster, so that the list stored under key i contains all points belonging to
    the cluster with label i.

    Args:
        points (list): List of data points.
        labels (list): List of cluster labels, one per data point.

    Returns:
        defaultdict: Mapping from cluster label to the list of data points in that cluster. Noise points
            (label -1) are skipped.
    """
    print alt("Aggregate clusters...")
    clusters = defaultdict(list)
    for i in range(len(labels)):
        label = labels[i]
        if label == -1:
            continue  # Skip noise points
        clusters[label].append(points[i])
    return clusters
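# Example usage (illustrative sketch with made-up points and DBSCAN-style labels, where -1 marks noise):
#
#   points = [[0.0, 0.1], [0.2, 0.0], [5.0, 5.1], [9.9, 9.8]]
#   labels = [0, 0, 1, -1]
#   clusters = aggregate_cluster(points, labels)
#   # clusters[0] == [[0.0, 0.1], [0.2, 0.0]]; clusters[1] == [[5.0, 5.1]]; the noise point is skipped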
def word_sim_eval(vector_inpath, wordpair_path, format="google"):
    """
    Function that lets the system assign word pairs a similarity score based on the cosine similarity of their
    word embeddings. Then, the correlation between those scores and human ratings is measured with Pearson's rho.

    Args:
        vector_inpath (str): Path to vector file. The file has to have the following format (separated by spaces):
            <index of original vector #1> <index of original vector #2> <Dimension 1> ... <Dimension n>
        wordpair_path (str): Path to word pair file.
        format (str): Format of word pair file {google|semrel}
    """
    print alt("Loading model...\n")
    model = load_vectors_from_model(vector_inpath)[1]

    # Read word pairs with values
    print alt("Loading word pairs...\n")
    raw_pairs, x = read_wordpairs(wordpair_path, format)
    y = [None] * len(raw_pairs)
    pairs = list(enumerate(raw_pairs))

    # Calculate similarity values for pairs
    print alt("Calculating word pair similarities...\n")
    error_counter = 0
    for pair_id, pair in pairs:
        try:
            a = model[capitalize(pair[0])]
            b = model[capitalize(pair[1])]
            sim = cosine(a, b)
            y[pair_id] = sim
        except (KeyError, TypeError):
            error_counter += 1  # No embedding found for at least one of the two words

    # Remove pairs where no word embedding was found
    x, y = remove_unknowns(x, y)

    # Calculate results
    rho, t, z = evaluate_wordpair_sims(x, y, len(pairs))
    successful_pairs = len(pairs) - error_counter
    successful_percentage = (len(pairs) - error_counter * 1.0) / len(pairs) * 100.0
    print alt("Calculated Pearson's rho for %i pairs (%.2f %%).\n\tr = %.4f\n\tt = %.4f\n\tz = %.4f\n" %
              (successful_pairs, successful_percentage, rho, t, z))
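# Example usage (illustrative sketch; "mappings.vec" and "wordsim.txt" are hypothetical paths, and the word
# pair file is assumed to follow one of the two supported formats, here "google"):
#
#   word_sim_eval("mappings.vec", "wordsim.txt", format="google")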
# Objective function used by train_clustering_parameters; X is expected to be defined in the enclosing scope
def simple_clustering(x):
    # Start clustering
    print alt("Current parameters: %s" % (str(x)))
    dbscan = DBSCAN(eps=x[0], min_samples=x[1], p=x[2])
    dbscan.fit(X)

    # Evaluate results
    cluster_sizes = get_cluster_size(dbscan.labels_)
    print alt("Current cluster sizes: %s" % (cluster_sizes))
    sscore = silhouette_score(X, dbscan.labels_)
    tscore = sscore / (len(cluster_sizes.keys()) - 1)
    print alt("Current value of objective function: %.5f" % (tscore))
    print "-" * 50
    return tscore
# Inner wrapper of the log_time decorator; func, logpath and interval come from the enclosing decorator scope
def wrapper(*args, **kwargs):
    result = [None]

    def return_value(*args, **kwargs):
        result[0] = func(*args, **kwargs)

    t = threading.Thread(target=return_value, args=args, kwargs=kwargs)
    log_entries = 0
    with codecs.open(logpath, "a", "utf-8") as logfile:
        start_time = time.time()
        t.start()
        # Log the elapsed time every <interval> seconds while the wrapped function is still running
        while t.is_alive():
            elapsed_time = time.time() - start_time
            if elapsed_time > interval * log_entries:
                m, s = divmod(elapsed_time, 60)
                h, m = divmod(m, 60)
                logfile.write(alt("Elapsed time for function '%s': %2dh %2dm %2ds\n" %
                                  (func.__name__, h, m, s)))
                log_entries += 1
    return result[0]
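# The wrapper above is the inner part of the log_time decorator used e.g. in convert_decow_to_plain.
# A hedged usage sketch (the log path, interval and function name are hypothetical):
#
#   @log_time("conversion.log", 600)  # write an "elapsed time" log entry every 10 minutes
#   def long_running_task(corpus_dir, out_dir):
#       ...  # the decorated function runs in a thread while the wrapper logs its elapsed time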